Total coverage: 230762 (12%) of 2011382
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The User Datagram Protocol (UDP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() calls
 *		Alan Cox	:	stopped close while in use off icmp
 *					messages. Not a fix but a botch that
 *					for udp at least is 'valid'.
 *		Alan Cox	:	Fixed icmp handling properly
 *		Alan Cox	:	Correct error for oversized datagrams
 *		Alan Cox	:	Tidied select() semantics.
 *		Alan Cox	:	udp_err() fixed properly, also now
 *					select and read wake correctly on errors
 *		Alan Cox	:	udp_send verify_area moved to avoid mem leak
 *		Alan Cox	:	UDP can count its memory
 *		Alan Cox	:	send to an unknown connection causes
 *					an ECONNREFUSED off the icmp, but
 *					does NOT close.
 *		Alan Cox	:	Switched to new sk_buff handlers. No more backlog!
 *		Alan Cox	:	Using generic datagram code. Even smaller and the PEEK
 *					bug no longer crashes it.
 *		Fred Van Kempen	:	Net2e support for sk->broadcast.
 *		Alan Cox	:	Uses skb_free_datagram
 *		Alan Cox	:	Added get/set sockopt support.
 *		Alan Cox	:	Broadcasting without option set returns EACCES.
 *		Alan Cox	:	No wakeup calls. Instead we now use the callbacks.
 *		Alan Cox	:	Use ip_tos and ip_ttl
 *		Alan Cox	:	SNMP Mibs
 *		Alan Cox	:	MSG_DONTROUTE, and 0.0.0.0 support.
 *		Matt Dillon	:	UDP length checks.
 *		Alan Cox	:	Smarter af_inet used properly.
 *		Alan Cox	:	Use new kernel side addressing.
 *		Alan Cox	:	Incorrect return on truncated datagram receive.
 *		Arnt Gulbrandsen :	New udp_send and stuff
 *		Alan Cox	:	Cache last socket
 *		Alan Cox	:	Route cache
 *		Jon Peatfield	:	Minor efficiency fix to sendto().
 *		Mike Shaver	:	RFC1122 checks.
 *		Alan Cox	:	Nonblocking error fix.
 *		Willy Konynenberg :	Transparent proxying support.
 *		Mike McLagan	:	Routing by source
 *		David S. Miller	:	New socket lookup architecture.
 *					Last socket cache retained as it
 *					does have a high hit rate.
 *		Olaf Kirch	:	Don't linearise iovec on sendmsg.
 *		Andi Kleen	:	Some cleanups, cache destination entry
 *					for connect.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Melvin Smith		:	Check msg_name not msg_namelen in sendto(),
 *					return ENOTCONN for unconnected sockets (POSIX)
 *	Janos Farkas		:	don't deliver multi/broadcasts to a different
 *					bound-to-device socket
 *	Hirokazu Takahashi	:	HW checksumming for outgoing UDP
 *					datagrams.
 *	Hirokazu Takahashi	:	sendfile() on UDP works now.
 *	Arnaldo C. Melo		:	convert /proc/net/udp to seq_file
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov:		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 *	Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
 *	James Chapman		:	Add L2TP encapsulation type.
 */

#define pr_fmt(fmt) "UDP: " fmt

#include <linux/bpf-cgroup.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/igmp.h>
#include <linux/inetdevice.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/sock_diag.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/ip.h>
#include <net/ip_tunnels.h>
#include <net/route.h>
#include <net/checksum.h>
#include <net/gso.h>
#include <net/xfrm.h>
#include <trace/events/udp.h>
#include <linux/static_key.h>
#include <linux/btf_ids.h>
#include <trace/events/skb.h>
#include <net/busy_poll.h>
#include "udp_impl.h"
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
#include <net/udp_tunnel.h>
#include <net/gro.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6_stubs.h>
#endif
#include <net/rps.h>

struct udp_table udp_table __read_mostly;

long sysctl_udp_mem[3] __read_mostly;
EXPORT_IPV6_MOD(sysctl_udp_mem);

DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc);

#define MAX_UDP_PORTS 65536
#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN_PERNET)

static struct udp_table *udp_get_table_prot(struct sock *sk)
{
	return sk->sk_prot->h.udp_table ? : sock_net(sk)->ipv4.udp_table;
}

static int udp_lib_lport_inuse(struct net *net, __u16 num,
			       const struct udp_hslot *hslot,
			       unsigned long *bitmap,
			       struct sock *sk, unsigned int log)
{
	kuid_t uid = sk_uid(sk);
	struct sock *sk2;

	sk_for_each(sk2, &hslot->head) {
		if (net_eq(sock_net(sk2), net) &&
		    sk2 != sk &&
		    (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
		    (!sk2->sk_reuse || !sk->sk_reuse) &&
		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
		    inet_rcv_saddr_equal(sk, sk2, true)) {
			if (sk2->sk_reuseport && sk->sk_reuseport &&
			    !rcu_access_pointer(sk->sk_reuseport_cb) &&
			    uid_eq(uid, sk_uid(sk2))) {
				if (!bitmap)
					return 0;
			} else {
				if (!bitmap)
					return 1;
				__set_bit(udp_sk(sk2)->udp_port_hash >> log,
					  bitmap);
			}
		}
	}
	return 0;
}

/*
 * Note: we still hold spinlock of primary hash chain, so no other writer
 * can insert/delete a socket with local_port == num
 */
static int udp_lib_lport_inuse2(struct net *net, __u16 num,
				struct udp_hslot *hslot2,
				struct sock *sk)
{
	kuid_t uid = sk_uid(sk);
	struct sock *sk2;
	int res = 0;

	spin_lock(&hslot2->lock);
	udp_portaddr_for_each_entry(sk2, &hslot2->head) {
		if (net_eq(sock_net(sk2), net) &&
		    sk2 != sk &&
		    (udp_sk(sk2)->udp_port_hash == num) &&
		    (!sk2->sk_reuse || !sk->sk_reuse) &&
		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
		    inet_rcv_saddr_equal(sk, sk2, true)) {
			if (sk2->sk_reuseport && sk->sk_reuseport &&
			    !rcu_access_pointer(sk->sk_reuseport_cb) &&
			    uid_eq(uid, sk_uid(sk2))) {
				res = 0;
			} else {
				res = 1;
			}
			break;
		}
	}
	spin_unlock(&hslot2->lock);
	return res;
}

static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
{
	struct net *net = sock_net(sk);
	kuid_t uid = sk_uid(sk);
	struct sock *sk2;

	sk_for_each(sk2, &hslot->head) {
		if (net_eq(sock_net(sk2), net) &&
		    sk2 != sk &&
		    sk2->sk_family == sk->sk_family &&
		    ipv6_only_sock(sk2) ==
ipv6_only_sock(sk) && (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) && (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) { return reuseport_add_sock(sk, sk2, inet_rcv_saddr_any(sk)); } } return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } /** * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 * * @sk: socket struct in question * @snum: port number to look up * @hash2_nulladdr: AF-dependent hash value in secondary hash chains, * with NULL address */ int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr) { struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; struct net *net = sock_net(sk); int error = -EADDRINUSE; if (!snum) { DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); unsigned short first, last; int low, high, remaining; unsigned int rand; inet_sk_get_local_port_range(sk, &low, &high); remaining = (high - low) + 1; rand = get_random_u32(); first = reciprocal_scale(rand, remaining) + low; /* * force rand to be an odd multiple of UDP_HTABLE_SIZE */ rand = (rand | 1) * (udptable->mask + 1); last = first + udptable->mask + 1; do { hslot = udp_hashslot(udptable, net, first); bitmap_zero(bitmap, PORTS_PER_CHAIN); spin_lock_bh(&hslot->lock); udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, udptable->log); snum = first; /* * Iterate on all possible values of snum for this hash. * Using steps of an odd multiple of UDP_HTABLE_SIZE * give us randomization and full range coverage. */ do { if (low <= snum && snum <= high && !test_bit(snum >> udptable->log, bitmap) && !inet_is_local_reserved_port(net, snum)) goto found; snum += rand; } while (snum != first); spin_unlock_bh(&hslot->lock); cond_resched(); } while (++first != last); goto fail; } else { hslot = udp_hashslot(udptable, net, snum); spin_lock_bh(&hslot->lock); if (hslot->count > 10) { int exist; unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum; slot2 &= udptable->mask; hash2_nulladdr &= udptable->mask; hslot2 = udp_hashslot2(udptable, slot2); if (hslot->count < hslot2->count) goto scan_primary_hash; exist = udp_lib_lport_inuse2(net, snum, hslot2, sk); if (!exist && (hash2_nulladdr != slot2)) { hslot2 = udp_hashslot2(udptable, hash2_nulladdr); exist = udp_lib_lport_inuse2(net, snum, hslot2, sk); } if (exist) goto fail_unlock; else goto found; } scan_primary_hash: if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0)) goto fail_unlock; } found: inet_sk(sk)->inet_num = snum; udp_sk(sk)->udp_port_hash = snum; udp_sk(sk)->udp_portaddr_hash ^= snum; if (sk_unhashed(sk)) { if (sk->sk_reuseport && udp_reuseport_add_sock(sk, hslot)) { inet_sk(sk)->inet_num = 0; udp_sk(sk)->udp_port_hash = 0; udp_sk(sk)->udp_portaddr_hash ^= snum; goto fail_unlock; } sock_set_flag(sk, SOCK_RCU_FREE); sk_add_node_rcu(sk, &hslot->head); hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock(&hslot2->lock); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node, &hslot2->head); else hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &hslot2->head); hslot2->count++; spin_unlock(&hslot2->lock); } error = 0; fail_unlock: spin_unlock_bh(&hslot->lock); fail: return error; } EXPORT_IPV6_MOD(udp_lib_get_port); int udp_v4_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = ipv4_portaddr_hash(sock_net(sk), 
htonl(INADDR_ANY), snum); unsigned int hash2_partial = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; return udp_lib_get_port(sk, snum, hash2_nulladdr); } static int compute_score(struct sock *sk, const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, int dif, int sdif) { int score; struct inet_sock *inet; bool dev_match; if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || ipv6_only_sock(sk)) return -1; if (sk->sk_rcv_saddr != daddr) return -1; score = (sk->sk_family == PF_INET) ? 2 : 1; inet = inet_sk(sk); if (inet->inet_daddr) { if (inet->inet_daddr != saddr) return -1; score += 4; } if (inet->inet_dport) { if (inet->inet_dport != sport) return -1; score += 4; } dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif); if (!dev_match) return -1; if (sk->sk_bound_dev_if) score += 4; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; return score; } u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport) { net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret)); return __inet_ehashfn(laddr, lport, faddr, fport, udp_ehash_secret + net_hash_mix(net)); } EXPORT_IPV6_MOD(udp_ehashfn); /** * udp4_lib_lookup1() - Simplified lookup using primary hash (destination port) * @net: Network namespace * @saddr: Source address, network order * @sport: Source port, network order * @daddr: Destination address, network order * @hnum: Destination port, host order * @dif: Destination interface index * @sdif: Destination bridge port index, if relevant * @udptable: Set of UDP hash tables * * Simplified lookup to be used as fallback if no sockets are found due to a * potential race between (receive) address change, and lookup happening before * the rehash operation. This function ignores SO_REUSEPORT groups while scoring * result sockets, because if we have one, we don't need the fallback at all. * * Called under rcu_read_lock(). * * Return: socket with highest matching score if any, NULL if none */ static struct sock *udp4_lib_lookup1(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, int sdif, const struct udp_table *udptable) { unsigned int slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot = &udptable->hash[slot]; struct sock *sk, *result = NULL; int score, badness = 0; sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { result = sk; badness = score; } } return result; } /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, int sdif, struct udp_hslot *hslot2, struct sk_buff *skb) { struct sock *sk, *result; int score, badness; bool need_rescore; result = NULL; badness = 0; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { need_rescore = false; rescore: score = compute_score(need_rescore ? 
result : sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { badness = score; if (need_rescore) continue; if (sk->sk_state == TCP_ESTABLISHED) { result = sk; continue; } result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), saddr, sport, daddr, hnum, udp_ehashfn); if (!result) { result = sk; continue; } /* Fall back to scoring if group has connections */ if (!reuseport_has_conns(sk)) return result; /* Reuseport logic returned an error, keep original score. */ if (IS_ERR(result)) continue; /* compute_score is too long of a function to be * inlined, and calling it again here yields * measurable overhead for some * workloads. Work around it by jumping * backwards to rescore 'result'. */ need_rescore = true; goto rescore; } } return result; } #if IS_ENABLED(CONFIG_BASE_SMALL) static struct sock *udp4_lib_lookup4(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, int sdif, struct udp_table *udptable) { return NULL; } static void udp_rehash4(struct udp_table *udptable, struct sock *sk, u16 newhash4) { } static void udp_unhash4(struct udp_table *udptable, struct sock *sk) { } #else /* !CONFIG_BASE_SMALL */ static struct sock *udp4_lib_lookup4(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, int sdif, struct udp_table *udptable) { const __portpair ports = INET_COMBINED_PORTS(sport, hnum); const struct hlist_nulls_node *node; struct udp_hslot *hslot4; unsigned int hash4, slot; struct udp_sock *up; struct sock *sk; hash4 = udp_ehashfn(net, daddr, hnum, saddr, sport); slot = hash4 & udptable->mask; hslot4 = &udptable->hash4[slot]; INET_ADDR_COOKIE(acookie, saddr, daddr); begin: /* SLAB_TYPESAFE_BY_RCU not used, so we don't need to touch sk_refcnt */ udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) { sk = (struct sock *)up; if (inet_match(net, sk, acookie, ports, dif, sdif)) return sk; } /* if the nulls value we got at the end of this lookup is not the * expected one, we must restart lookup. We probably met an item that * was moved to another chain due to rehash. */ if (get_nulls_value(node) != slot) goto begin; return NULL; } /* udp_rehash4() only checks hslot4, and hash4_cnt is not processed. */ static void udp_rehash4(struct udp_table *udptable, struct sock *sk, u16 newhash4) { struct udp_hslot *hslot4, *nhslot4; hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash); nhslot4 = udp_hashslot4(udptable, newhash4); udp_sk(sk)->udp_lrpa_hash = newhash4; if (hslot4 != nhslot4) { spin_lock_bh(&hslot4->lock); hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node); hslot4->count--; spin_unlock_bh(&hslot4->lock); spin_lock_bh(&nhslot4->lock); hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node, &nhslot4->nulls_head); nhslot4->count++; spin_unlock_bh(&nhslot4->lock); } } static void udp_unhash4(struct udp_table *udptable, struct sock *sk) { struct udp_hslot *hslot2, *hslot4; if (udp_hashed4(sk)) { hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash); spin_lock(&hslot4->lock); hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node); hslot4->count--; spin_unlock(&hslot4->lock); spin_lock(&hslot2->lock); udp_hash4_dec(hslot2); spin_unlock(&hslot2->lock); } } void udp_lib_hash4(struct sock *sk, u16 hash) { struct udp_hslot *hslot, *hslot2, *hslot4; struct net *net = sock_net(sk); struct udp_table *udptable; /* Connected udp socket can re-connect to another remote address, which * will be handled by rehash. 
Thus no need to redo hash4 here. */ if (udp_hashed4(sk)) return; udptable = net->ipv4.udp_table; hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); hslot4 = udp_hashslot4(udptable, hash); udp_sk(sk)->udp_lrpa_hash = hash; spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); spin_lock(&hslot4->lock); hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node, &hslot4->nulls_head); hslot4->count++; spin_unlock(&hslot4->lock); spin_lock(&hslot2->lock); udp_hash4_inc(hslot2); spin_unlock(&hslot2->lock); spin_unlock_bh(&hslot->lock); } EXPORT_IPV6_MOD(udp_lib_hash4); /* call with sock lock */ void udp4_hash4(struct sock *sk) { struct net *net = sock_net(sk); unsigned int hash; if (sk_unhashed(sk) || sk->sk_rcv_saddr == htonl(INADDR_ANY)) return; hash = udp_ehashfn(net, sk->sk_rcv_saddr, sk->sk_num, sk->sk_daddr, sk->sk_dport); udp_lib_hash4(sk, hash); } EXPORT_IPV6_MOD(udp4_hash4); #endif /* CONFIG_BASE_SMALL */ /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *udptable, struct sk_buff *skb) { unsigned short hnum = ntohs(dport); struct udp_hslot *hslot2; struct sock *result, *sk; unsigned int hash2; hash2 = ipv4_portaddr_hash(net, daddr, hnum); hslot2 = udp_hashslot2(udptable, hash2); if (udp_has_hash4(hslot2)) { result = udp4_lib_lookup4(net, saddr, sport, daddr, hnum, dif, sdif, udptable); if (result) /* udp4_lib_lookup4 return sk or NULL */ return result; } /* Lookup connected or non-wildcard socket */ result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, hslot2, skb); if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED) goto done; /* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled) && udptable == net->ipv4.udp_table) { sk = inet_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr), saddr, sport, daddr, hnum, dif, udp_ehashfn); if (sk) { result = sk; goto done; } } /* Got non-wildcard socket or error on first lookup */ if (result) goto done; /* Lookup wildcard sockets */ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); hslot2 = udp_hashslot2(udptable, hash2); result = udp4_lib_lookup2(net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif, hslot2, skb); if (!IS_ERR_OR_NULL(result)) goto done; /* Primary hash (destination port) lookup as fallback for this race: * 1. __ip4_datagram_connect() sets sk_rcv_saddr * 2. lookup (this function): new sk_rcv_saddr, hashes not updated yet * 3. rehash operation updating _secondary and four-tuple_ hashes * The primary hash doesn't need an update after 1., so, thanks to this * further step, 1. and 3. don't need to be atomic against the lookup. 
*/ result = udp4_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif, udptable); done: if (IS_ERR(result)) return NULL; return result; } EXPORT_SYMBOL_GPL(__udp4_lib_lookup); static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport, struct udp_table *udptable) { const struct iphdr *iph = ip_hdr(skb); return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, iph->daddr, dport, inet_iif(skb), inet_sdif(skb), udptable, skb); } struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport) { const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct iphdr *iph = (struct iphdr *)(skb->data + offset); struct net *net = dev_net(skb->dev); int iif, sdif; inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, iph->daddr, dport, iif, sdif, net->ipv4.udp_table, NULL); } /* Must be called under rcu_read_lock(). * Does increment socket refcount. */ #if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4) struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { struct sock *sk; sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, 0, net->ipv4.udp_table, NULL); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } EXPORT_SYMBOL_GPL(udp4_lib_lookup); #endif static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, int dif, int sdif, unsigned short hnum) { const struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || (inet->inet_daddr && inet->inet_daddr != rmt_addr) || (inet->inet_dport != rmt_port && inet->inet_dport) || (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) || ipv6_only_sock(sk) || !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return false; if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif)) return false; return true; } DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); EXPORT_IPV6_MOD(udp_encap_needed_key); #if IS_ENABLED(CONFIG_IPV6) DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); EXPORT_IPV6_MOD(udpv6_encap_needed_key); #endif void udp_encap_enable(void) { static_branch_inc(&udp_encap_needed_key); } EXPORT_SYMBOL(udp_encap_enable); void udp_encap_disable(void) { static_branch_dec(&udp_encap_needed_key); } EXPORT_SYMBOL(udp_encap_disable); /* Handler for tunnels with arbitrary destination ports: no socket lookup, go * through error handlers in encapsulations looking for a match. */ static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info) { int i; for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { int (*handler)(struct sk_buff *skb, u32 info); const struct ip_tunnel_encap_ops *encap; encap = rcu_dereference(iptun_encaps[i]); if (!encap) continue; handler = encap->err_handler; if (handler && !handler(skb, info)) return 0; } return -ENOENT; } /* Try to match ICMP errors to UDP tunnels by looking up a socket without * reversing source and destination port: this will match tunnels that force the * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that * lwtunnels might actually break this assumption by being configured with * different destination ports on endpoints, in this case we won't be able to * trace ICMP messages back to them. * * If this doesn't match any socket, probe tunnels with arbitrary destination * ports (e.g. 
FoU, GUE): there, the receiving socket is useless, as the port * we've sent packets to won't necessarily match the local destination port. * * Then ask the tunnel implementation to match the error against a valid * association. * * Return an error if we can't find a match, the socket if we need further * processing, zero otherwise. */ static struct sock *__udp4_lib_err_encap(struct net *net, const struct iphdr *iph, struct udphdr *uh, struct udp_table *udptable, struct sock *sk, struct sk_buff *skb, u32 info) { int (*lookup)(struct sock *sk, struct sk_buff *skb); int network_offset, transport_offset; struct udp_sock *up; network_offset = skb_network_offset(skb); transport_offset = skb_transport_offset(skb); /* Network header needs to point to the outer IPv4 header inside ICMP */ skb_reset_network_header(skb); /* Transport header needs to point to the UDP header */ skb_set_transport_header(skb, iph->ihl << 2); if (sk) { up = udp_sk(sk); lookup = READ_ONCE(up->encap_err_lookup); if (lookup && lookup(sk, skb)) sk = NULL; goto out; } sk = __udp4_lib_lookup(net, iph->daddr, uh->source, iph->saddr, uh->dest, skb->dev->ifindex, 0, udptable, NULL); if (sk) { up = udp_sk(sk); lookup = READ_ONCE(up->encap_err_lookup); if (!lookup || lookup(sk, skb)) sk = NULL; } out: if (!sk) sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info)); skb_set_transport_header(skb, transport_offset); skb_set_network_header(skb, network_offset); return sk; } /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should * be closed and the error returned to the user. If err > 0 * it's just the icmp type << 8 | icmp code. * Header points to the ip header of the error packet. We move * on past this. Then (as it used to claim before adjustment) * header points to the first 8 bytes of the udp header. We need * to find the appropriate port. */ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) { struct inet_sock *inet; const struct iphdr *iph = (const struct iphdr *)skb->data; struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; bool tunnel = false; struct sock *sk; int harderr; int err; struct net *net = dev_net(skb->dev); sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex, inet_sdif(skb), udptable, NULL); if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) { /* No socket for error: try tunnels before discarding */ if (static_branch_unlikely(&udp_encap_needed_key)) { sk = __udp4_lib_err_encap(net, iph, uh, udptable, sk, skb, info); if (!sk) return 0; } else sk = ERR_PTR(-ENOENT); if (IS_ERR(sk)) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return PTR_ERR(sk); } tunnel = true; } err = 0; harderr = 0; inet = inet_sk(sk); switch (type) { default: case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; case ICMP_SOURCE_QUENCH: goto out; case ICMP_PARAMETERPROB: err = EPROTO; harderr = 1; break; case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ ipv4_sk_update_pmtu(skb, sk, info); if (READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT) { err = EMSGSIZE; harderr = 1; break; } goto out; } err = EHOSTUNREACH; if (code <= NR_ICMP_UNREACH) { harderr = icmp_err_convert[code].fatal; err = icmp_err_convert[code].errno; } break; case ICMP_REDIRECT: ipv4_sk_redirect(skb, sk); goto out; } /* * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3. 
*/ if (tunnel) { /* ...not for tunnels though: we don't have a sending socket */ if (udp_sk(sk)->encap_err_rcv) udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); goto out; } if (!inet_test_bit(RECVERR, sk)) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); sk->sk_err = err; sk_error_report(sk); out: return 0; } int udp_err(struct sk_buff *skb, u32 info) { return __udp4_lib_err(skb, info, dev_net(skb->dev)->ipv4.udp_table); } /* * Throw away all pending data and cancel the corking. Socket is locked. */ void udp_flush_pending_frames(struct sock *sk) { struct udp_sock *up = udp_sk(sk); if (up->pending) { up->len = 0; WRITE_ONCE(up->pending, 0); ip_flush_pending_frames(sk); } } EXPORT_IPV6_MOD(udp_flush_pending_frames); /** * udp4_hwcsum - handle outgoing HW checksumming * @skb: sk_buff containing the filled-in UDP header * (checksum field must be zeroed out) * @src: source IP address * @dst: destination IP address */ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) { struct udphdr *uh = udp_hdr(skb); int offset = skb_transport_offset(skb); int len = skb->len - offset; int hlen = len; __wsum csum = 0; if (!skb_has_frag_list(skb)) { /* * Only one fragment on the socket. */ skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); } else { struct sk_buff *frags; /* * HW-checksum won't work as there are two or more * fragments on the socket so that all csums of sk_buffs * should be together */ skb_walk_frags(skb, frags) { csum = csum_add(csum, frags->csum); hlen -= frags->len; } csum = skb_checksum(skb, offset, hlen, csum); skb->ip_summed = CHECKSUM_NONE; uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } } EXPORT_SYMBOL_GPL(udp4_hwcsum); /* Function to set UDP checksum for an IPv4 UDP packet. This is intended * for the simple case like when setting the checksum for a UDP tunnel. 
*/ void udp_set_csum(bool nocheck, struct sk_buff *skb, __be32 saddr, __be32 daddr, int len) { struct udphdr *uh = udp_hdr(skb); if (nocheck) { uh->check = 0; } else if (skb_is_gso(skb)) { uh->check = ~udp_v4_check(len, saddr, daddr, 0); } else if (skb->ip_summed == CHECKSUM_PARTIAL) { uh->check = 0; uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v4_check(len, saddr, daddr, 0); } } EXPORT_SYMBOL(udp_set_csum); static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, struct inet_cork *cork) { struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); struct udphdr *uh; int err; int is_udplite = IS_UDPLITE(sk); int offset = skb_transport_offset(skb); int len = skb->len - offset; int datalen = len - sizeof(*uh); __wsum csum = 0; /* * Create a UDP header */ uh = udp_hdr(skb); uh->source = inet->inet_sport; uh->dest = fl4->fl4_dport; uh->len = htons(len); uh->check = 0; if (cork->gso_size) { const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); if (hlen + min(datalen, cork->gso_size) > cork->fragsize) { kfree_skb(skb); return -EMSGSIZE; } if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); return -EINVAL; } if (sk->sk_no_check_tx) { kfree_skb(skb); return -EINVAL; } if (is_udplite || dst_xfrm(skb_dst(skb))) { kfree_skb(skb); return -EIO; } if (datalen > cork->gso_size) { skb_shinfo(skb)->gso_size = cork->gso_size; skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen, cork->gso_size); /* Don't checksum the payload, skb will get segmented */ goto csum_partial; } } if (is_udplite) /* UDP-Lite */ csum = udplite_csum(skb); else if (sk->sk_no_check_tx) { /* UDP csum off */ skb->ip_summed = CHECKSUM_NONE; goto send; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ csum_partial: udp4_hwcsum(skb, fl4->saddr, fl4->daddr); goto send; } else csum = udp_csum(skb); /* add protocol-dependent pseudo-header */ uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len, sk->sk_protocol, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; send: err = ip_send_skb(sock_net(sk), skb); if (err) { if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) { UDP_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS, is_udplite); err = 0; } } else UDP_INC_STATS(sock_net(sk), UDP_MIB_OUTDATAGRAMS, is_udplite); return err; } /* * Push out all pending data as one UDP datagram. Socket is locked. 
*/ int udp_push_pending_frames(struct sock *sk) { struct udp_sock *up = udp_sk(sk); struct inet_sock *inet = inet_sk(sk); struct flowi4 *fl4 = &inet->cork.fl.u.ip4; struct sk_buff *skb; int err = 0; skb = ip_finish_skb(sk, fl4); if (!skb) goto out; err = udp_send_skb(skb, fl4, &inet->cork.base); out: up->len = 0; WRITE_ONCE(up->pending, 0); return err; } EXPORT_IPV6_MOD(udp_push_pending_frames); static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size) { switch (cmsg->cmsg_type) { case UDP_SEGMENT: if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16))) return -EINVAL; *gso_size = *(__u16 *)CMSG_DATA(cmsg); return 0; default: return -EINVAL; } } int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size) { struct cmsghdr *cmsg; bool need_ip = false; int err; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_UDP) { need_ip = true; continue; } err = __udp_cmsg_send(cmsg, gso_size); if (err) return err; } return need_ip; } EXPORT_IPV6_MOD_GPL(udp_cmsg_send); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); struct flowi4 fl4_stack; struct flowi4 *fl4; int ulen = len; struct ipcm_cookie ipc; struct rtable *rt = NULL; int free = 0; int connected = 0; __be32 daddr, faddr, saddr; u8 scope; __be16 dport; int err, is_udplite = IS_UDPLITE(sk); int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sk_buff *skb; struct ip_options_data opt_copy; int uc_index; if (len > 0xFFFF) return -EMSGSIZE; /* * Check the flags. */ if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; fl4 = &inet->cork.fl.u.ip4; if (READ_ONCE(up->pending)) { /* * There are pending frames. * The socket lock must be held while it's corked. */ lock_sock(sk); if (likely(up->pending)) { if (unlikely(up->pending != AF_INET)) { release_sock(sk); return -EINVAL; } goto do_append_data; } release_sock(sk); } ulen += sizeof(struct udphdr); /* * Get and verify the address. */ if (usin) { if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { if (usin->sin_family != AF_UNSPEC) return -EAFNOSUPPORT; } daddr = usin->sin_addr.s_addr; dport = usin->sin_port; if (dport == 0) return -EINVAL; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = inet->inet_daddr; dport = inet->inet_dport; /* Open fast path for connected socket. Route will not be used, if at least one option is set. */ connected = 1; } ipcm_init_sk(&ipc, inet); ipc.gso_size = READ_ONCE(up->gso_size); if (msg->msg_controllen) { err = udp_cmsg_send(sk, msg, &ipc.gso_size); if (err > 0) { err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6); connected = 0; } if (unlikely(err < 0)) { kfree(ipc.opt); return err; } if (ipc.opt) free = 1; } if (!ipc.opt) { struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { memcpy(&opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); ipc.opt = &opt_copy.opt; } rcu_read_unlock(); } if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &msg->msg_namelen, &ipc.addr); if (err) goto out_free; if (usin) { if (usin->sin_port == 0) { /* BPF program set invalid port. 
Reject it. */ err = -EINVAL; goto out_free; } daddr = usin->sin_addr.s_addr; dport = usin->sin_port; } } saddr = ipc.addr; ipc.addr = faddr = daddr; if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) { err = -EINVAL; goto out_free; } faddr = ipc.opt->opt.faddr; connected = 0; } scope = ip_sendmsg_scope(inet, &ipc, msg); if (scope == RT_SCOPE_LINK) connected = 0; uc_index = READ_ONCE(inet->uc_index); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = READ_ONCE(inet->mc_index); if (!saddr) saddr = READ_ONCE(inet->mc_addr); connected = 0; } else if (!ipc.oif) { ipc.oif = uc_index; } else if (ipv4_is_lbcast(daddr) && uc_index) { /* oif is set, packet is to local broadcast and * uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. * If so, we want to allow the send using the uc_index. */ if (ipc.oif != uc_index && ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk), uc_index)) { ipc.oif = uc_index; } } if (connected) rt = dst_rtable(sk_dst_check(sk, 0)); if (!rt) { struct net *net = sock_net(sk); __u8 flow_flags = inet_sk_flowi_flags(sk); fl4 = &fl4_stack; flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, flow_flags, faddr, saddr, dport, inet->inet_sport, sk_uid(sk)); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; if (err == -ENETUNREACH) IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); goto out; } err = -EACCES; if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) goto out; if (connected) sk_dst_set(sk, dst_clone(&rt->dst)); } if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; back_from_confirm: saddr = fl4->saddr; if (!ipc.addr) daddr = ipc.addr = fl4->daddr; /* Lockless fast path for the non-corking case. */ if (!corkreq) { struct inet_cork cork; skb = ip_make_skb(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, &cork, msg->msg_flags); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) err = udp_send_skb(skb, fl4, &cork); goto out; } lock_sock(sk); if (unlikely(up->pending)) { /* The socket is already corked while preparing it. */ /* ... which is an evident application bug. --ANK */ release_sock(sk); net_dbg_ratelimited("socket already corked\n"); err = -EINVAL; goto out; } /* * Now cork the socket to pend data. */ fl4 = &inet->cork.fl.u.ip4; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; fl4->fl4_sport = inet->inet_sport; WRITE_ONCE(up->pending, AF_INET); do_append_data: up->len += ulen; err = ip_append_data(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_flush_pending_frames(sk); else if (!corkreq) err = udp_push_pending_frames(sk); else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) WRITE_ONCE(up->pending, 0); release_sock(sk); out: ip_rt_put(rt); out_free: if (free) kfree(ipc.opt); if (!err) return len; /* * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting * ENOBUFS might not be good (it's not tunable per se), but otherwise * we don't have a good statistic (IpOutDiscards but it can be too many * things). We could add another new stat but at least for now that * seems like overkill. 
*/ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { UDP_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS, is_udplite); } return err; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(&rt->dst, &fl4->daddr); if (!(msg->msg_flags&MSG_PROBE) || len) goto back_from_confirm; err = 0; goto out; } EXPORT_SYMBOL(udp_sendmsg); void udp_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct udp_sock *up = udp_sk(sk); if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk)) return; lock_sock(sk); if (up->pending && !udp_test_bit(CORK, sk)) udp_push_pending_frames(sk); release_sock(sk); } EXPORT_IPV6_MOD_GPL(udp_splice_eof); #define UDP_SKB_IS_STATELESS 0x80000000 /* all head states (dst, sk, nf conntrack) except skb extensions are * cleared by udp_rcv(). * * We need to preserve secpath, if present, to eventually process * IP_CMSG_PASSSEC at recvmsg() time. * * Other extensions can be cleared. */ static bool udp_try_make_stateless(struct sk_buff *skb) { if (!skb_has_extensions(skb)) return true; if (!secpath_exists(skb)) { skb_ext_reset(skb); return true; } return false; } static void udp_set_dev_scratch(struct sk_buff *skb) { struct udp_dev_scratch *scratch = udp_skb_scratch(skb); BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long)); scratch->_tsize_state = skb->truesize; #if BITS_PER_LONG == 64 scratch->len = skb->len; scratch->csum_unnecessary = !!skb_csum_unnecessary(skb); scratch->is_linear = !skb_is_nonlinear(skb); #endif if (udp_try_make_stateless(skb)) scratch->_tsize_state |= UDP_SKB_IS_STATELESS; } static void udp_skb_csum_unnecessary_set(struct sk_buff *skb) { /* We come here after udp_lib_checksum_complete() returned 0. * This means that __skb_checksum_complete() might have * set skb->csum_valid to 1. * On 64bit platforms, we can set csum_unnecessary * to true, but only if the skb is not shared. */ #if BITS_PER_LONG == 64 if (!skb_shared(skb)) udp_skb_scratch(skb)->csum_unnecessary = true; #endif } static int udp_skb_truesize(struct sk_buff *skb) { return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS; } static bool udp_skb_has_head_state(struct sk_buff *skb) { return !(udp_skb_scratch(skb)->_tsize_state & UDP_SKB_IS_STATELESS); } /* fully reclaim rmem/fwd memory allocated for skb */ static void udp_rmem_release(struct sock *sk, unsigned int size, int partial, bool rx_queue_lock_held) { struct udp_sock *up = udp_sk(sk); struct sk_buff_head *sk_queue; unsigned int amt; if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; if (size < READ_ONCE(up->forward_threshold) && !skb_queue_empty(&up->reader_queue)) return; } else { size += up->forward_deficit; } up->forward_deficit = 0; /* acquire the sk_receive_queue for fwd allocated memory scheduling, * if the called don't held it already */ sk_queue = &sk->sk_receive_queue; if (!rx_queue_lock_held) spin_lock(&sk_queue->lock); amt = (size + sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1); sk_forward_alloc_add(sk, size - amt); if (amt) __sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT); atomic_sub(size, &sk->sk_rmem_alloc); /* this can save us from acquiring the rx queue lock on next receive */ skb_queue_splice_tail_init(sk_queue, &up->reader_queue); if (!rx_queue_lock_held) spin_unlock(&sk_queue->lock); } /* Note: called with reader_queue.lock held. * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch * This avoids a cache line miss while receive_queue lock is held. * Look at __udp_enqueue_schedule_skb() to find where this copy is done. 
*/ void udp_skb_destructor(struct sock *sk, struct sk_buff *skb) { prefetch(&skb->data); udp_rmem_release(sk, udp_skb_truesize(skb), 1, false); } EXPORT_IPV6_MOD(udp_skb_destructor); /* as above, but the caller held the rx queue lock, too */ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) { prefetch(&skb->data); udp_rmem_release(sk, udp_skb_truesize(skb), 1, true); } static int udp_rmem_schedule(struct sock *sk, int size) { int delta; delta = size - sk->sk_forward_alloc; if (delta > 0 && !__sk_mem_schedule(sk, delta, SK_MEM_RECV)) return -ENOBUFS; return 0; } int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; struct udp_prod_queue *udp_prod_queue; struct sk_buff *next, *to_drop = NULL; struct llist_node *ll_list; unsigned int rmem, rcvbuf; int size, err = -ENOMEM; int total_size = 0; int q_size = 0; int dropcount; int nb = 0; rmem = atomic_read(&sk->sk_rmem_alloc); rcvbuf = READ_ONCE(sk->sk_rcvbuf); size = skb->truesize; udp_prod_queue = &udp_sk(sk)->udp_prod_queue[numa_node_id()]; rmem += atomic_read(&udp_prod_queue->rmem_alloc); /* Immediately drop when the receive queue is full. * Cast to unsigned int performs the boundary check for INT_MAX. */ if (rmem + size > rcvbuf) { if (rcvbuf > INT_MAX >> 1) goto drop; /* Accept the packet if queue is empty. */ if (rmem) goto drop; } /* Under mem pressure, it might be helpful to help udp_recvmsg() * having linear skbs : * - Reduce memory overhead and thus increase receive queue capacity * - Less cache line misses at copyout() time * - Less work at consume_skb() (less alien page frag freeing) */ if (rmem > (rcvbuf >> 1)) { skb_condense(skb); size = skb->truesize; } udp_set_dev_scratch(skb); atomic_add(size, &udp_prod_queue->rmem_alloc); if (!llist_add(&skb->ll_node, &udp_prod_queue->ll_root)) return 0; dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? sk_drops_read(sk) : 0; spin_lock(&list->lock); ll_list = llist_del_all(&udp_prod_queue->ll_root); ll_list = llist_reverse_order(ll_list); llist_for_each_entry_safe(skb, next, ll_list, ll_node) { size = udp_skb_truesize(skb); total_size += size; err = udp_rmem_schedule(sk, size); if (unlikely(err)) { /* Free the skbs outside of locked section. */ skb->next = to_drop; to_drop = skb; continue; } q_size += size; sk_forward_alloc_add(sk, -size); /* no need to setup a destructor, we will explicitly release the * forward allocated memory on dequeue */ SOCK_SKB_CB(skb)->dropcount = dropcount; nb++; __skb_queue_tail(list, skb); } atomic_add(q_size, &sk->sk_rmem_alloc); spin_unlock(&list->lock); if (!sock_flag(sk, SOCK_DEAD)) { /* Multiple threads might be blocked in recvmsg(), * using prepare_to_wait_exclusive(). */ while (nb) { INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); nb--; } } if (unlikely(to_drop)) { for (nb = 0; to_drop != NULL; nb++) { skb = to_drop; to_drop = skb->next; skb_mark_not_on_list(skb); /* TODO: update SNMP values. 
*/ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM); } numa_drop_add(&udp_sk(sk)->drop_counters, nb); } atomic_sub(total_size, &udp_prod_queue->rmem_alloc); return 0; drop: udp_drops_inc(sk); return err; } EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb); void udp_destruct_common(struct sock *sk) { /* reclaim completely the forward allocated memory */ struct udp_sock *up = udp_sk(sk); unsigned int total = 0; struct sk_buff *skb; skb_queue_splice_tail_init(&sk->sk_receive_queue, &up->reader_queue); while ((skb = __skb_dequeue(&up->reader_queue)) != NULL) { total += skb->truesize; kfree_skb(skb); } udp_rmem_release(sk, total, 0, true); kfree(up->udp_prod_queue); } EXPORT_IPV6_MOD_GPL(udp_destruct_common); static void udp_destruct_sock(struct sock *sk) { udp_destruct_common(sk); inet_sock_destruct(sk); } int udp_init_sock(struct sock *sk) { int res = udp_lib_init_sock(sk); sk->sk_destruct = udp_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); return res; } void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) { if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset))) sk_peek_offset_bwd(sk, len); if (!skb_shared(skb)) { skb_attempt_defer_free(skb); return; } if (!skb_unref(skb)) return; /* In the more common cases we cleared the head states previously, * see __udp_queue_rcv_skb(). */ if (unlikely(udp_skb_has_head_state(skb))) skb_release_head_state(skb); __consume_stateless_skb(skb); } EXPORT_IPV6_MOD_GPL(skb_consume_udp); static struct sk_buff *__first_packet_length(struct sock *sk, struct sk_buff_head *rcvq, unsigned int *total) { struct sk_buff *skb; while ((skb = skb_peek(rcvq)) != NULL) { if (udp_lib_checksum_complete(skb)) { __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, IS_UDPLITE(sk)); __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, IS_UDPLITE(sk)); udp_drops_inc(sk); __skb_unlink(skb, rcvq); *total += skb->truesize; kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); } else { udp_skb_csum_unnecessary_set(skb); break; } } return skb; } /** * first_packet_length - return length of first packet in receive queue * @sk: socket * * Drops all bad checksum frames, until a valid one is found. * Returns the length of found skb, or -1 if none is found. */ static int first_packet_length(struct sock *sk) { struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue; struct sk_buff_head *sk_queue = &sk->sk_receive_queue; unsigned int total = 0; struct sk_buff *skb; int res; spin_lock_bh(&rcvq->lock); skb = __first_packet_length(sk, rcvq, &total); if (!skb && !skb_queue_empty_lockless(sk_queue)) { spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, rcvq); spin_unlock(&sk_queue->lock); skb = __first_packet_length(sk, rcvq, &total); } res = skb ? 
skb->len : -1; if (total) udp_rmem_release(sk, total, 1, false); spin_unlock_bh(&rcvq->lock); return res; } /* * IOCTL requests applicable to the UDP protocol */ int udp_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); return 0; } case SIOCINQ: { *karg = max_t(int, 0, first_packet_length(sk)); return 0; } default: return -ENOIOCTLCMD; } return 0; } EXPORT_IPV6_MOD(udp_ioctl); struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int *off, int *err) { struct sk_buff_head *sk_queue = &sk->sk_receive_queue; struct sk_buff_head *queue; struct sk_buff *last; long timeo; int error; queue = &udp_sk(sk)->reader_queue; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { struct sk_buff *skb; error = sock_error(sk); if (error) break; error = -EAGAIN; do { spin_lock_bh(&queue->lock); skb = __skb_try_recv_from_queue(queue, flags, off, err, &last); if (skb) { if (!(flags & MSG_PEEK)) udp_skb_destructor(sk, skb); spin_unlock_bh(&queue->lock); return skb; } if (skb_queue_empty_lockless(sk_queue)) { spin_unlock_bh(&queue->lock); goto busy_check; } /* refill the reader queue and walk it again * keep both queues locked to avoid re-acquiring * the sk_receive_queue lock if fwd memory scheduling * is needed. */ spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, queue); skb = __skb_try_recv_from_queue(queue, flags, off, err, &last); if (skb && !(flags & MSG_PEEK)) udp_skb_dtor_locked(sk, skb); spin_unlock(&sk_queue->lock); spin_unlock_bh(&queue->lock); if (skb) return skb; busy_check: if (!sk_can_busy_loop(sk)) break; sk_busy_loop(sk, flags & MSG_DONTWAIT); } while (!skb_queue_empty_lockless(sk_queue)); /* sk_queue is empty, reader_queue may contain peeked packets */ } while (timeo && !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, &error, &timeo, (struct sk_buff *)sk_queue)); *err = error; return NULL; } EXPORT_SYMBOL(__skb_recv_udp); int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct sk_buff *skb; int err; try_again: skb = skb_recv_udp(sk, MSG_DONTWAIT, &err); if (!skb) return err; if (udp_lib_checksum_complete(skb)) { int is_udplite = IS_UDPLITE(sk); struct net *net = sock_net(sk); __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); udp_drops_inc(sk); kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); goto try_again; } WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); return recv_actor(sk, skb); } EXPORT_IPV6_MOD(udp_read_skb); /* * This should be easy, if there is something there we * return it, otherwise we block. */ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; unsigned int ulen, copied; int off, err, peeking = flags & MSG_PEEK; int is_udplite = IS_UDPLITE(sk); bool checksum_valid = false; if (flags & MSG_ERRQUEUE) return ip_recv_error(sk, msg, len, addr_len); try_again: off = sk_peek_offset(sk, flags); skb = __skb_recv_udp(sk, flags, &off, &err); if (!skb) return err; ulen = udp_skb_len(skb); copied = len; if (copied > ulen - off) copied = ulen - off; else if (copied < ulen) msg->msg_flags |= MSG_TRUNC; /* * If checksum is needed at all, try to do it while copying the * data. If the data is truncated, or if we only want a partial * coverage checksum (UDP-Lite), do it before the copy. 
*/ if (copied < ulen || peeking || (is_udplite && UDP_SKB_CB(skb)->partial_cov)) { checksum_valid = udp_skb_csum_unnecessary(skb) || !__udp_lib_checksum_complete(skb); if (!checksum_valid) goto csum_copy_err; } if (checksum_valid || udp_skb_csum_unnecessary(skb)) { if (udp_skb_is_linear(skb)) err = copy_linear_skb(skb, copied, off, &msg->msg_iter); else err = skb_copy_datagram_msg(skb, off, msg, copied); } else { err = skb_copy_and_csum_datagram_msg(skb, off, msg); if (err == -EINVAL) goto csum_copy_err; } if (unlikely(err)) { if (!peeking) { udp_drops_inc(sk); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } kfree_skb(skb); return err; } if (!peeking) UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, is_udplite); sock_recv_cmsgs(msg, sk, skb); /* Copy the address. */ if (sin) { sin->sin_family = AF_INET; sin->sin_port = udp_hdr(skb)->source; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, (struct sockaddr *)sin, addr_len); } if (udp_test_bit(GRO_ENABLED, sk)) udp_cmsg_recv(msg, sk, skb); if (inet_cmsg_flags(inet)) ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); err = copied; if (flags & MSG_TRUNC) err = ulen; skb_consume_udp(sk, skb, peeking ? -err : err); return err; csum_copy_err: if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags, udp_skb_destructor)) { UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); /* starting over for a new packet, but check if we need to yield */ cond_resched(); msg->msg_flags &= ~MSG_TRUNC; goto try_again; } int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { /* This check is replicated from __ip4_datagram_connect() and * intended to prevent BPF program called below from accessing bytes * that are out of the bound specified by user in addr_len. */ if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len); } EXPORT_IPV6_MOD(udp_pre_connect); static int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { int res; lock_sock(sk); res = __ip4_datagram_connect(sk, uaddr, addr_len); if (!res) udp4_hash4(sk); release_sock(sk); return res; } int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); /* * 1003.1g - break association. 
*/ sk->sk_state = TCP_CLOSE; inet->inet_daddr = 0; inet->inet_dport = 0; sock_rps_reset_rxhash(sk); sk->sk_bound_dev_if = 0; if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) { inet_reset_saddr(sk); if (sk->sk_prot->rehash && (sk->sk_userlocks & SOCK_BINDPORT_LOCK)) sk->sk_prot->rehash(sk); } if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) { sk->sk_prot->unhash(sk); inet->inet_sport = 0; } sk_dst_reset(sk); return 0; } EXPORT_SYMBOL(__udp_disconnect); int udp_disconnect(struct sock *sk, int flags) { lock_sock(sk); __udp_disconnect(sk, flags); release_sock(sk); return 0; } EXPORT_IPV6_MOD(udp_disconnect); void udp_lib_unhash(struct sock *sk) { if (sk_hashed(sk)) { struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; sock_rps_delete_flow(sk); hslot = udp_hashslot(udptable, sock_net(sk), udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); if (sk_del_node_init_rcu(sk)) { hslot->count--; inet_sk(sk)->inet_num = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_lock(&hslot2->lock); hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); udp_unhash4(udptable, sk); } spin_unlock_bh(&hslot->lock); } } EXPORT_IPV6_MOD(udp_lib_unhash); /* * inet_rcv_saddr was changed, we must rehash secondary hash */ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) { if (sk_hashed(sk)) { struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2, *nhslot2; hslot = udp_hashslot(udptable, sock_net(sk), udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); nhslot2 = udp_hashslot2(udptable, newhash); udp_sk(sk)->udp_portaddr_hash = newhash; if (hslot2 != nhslot2 || rcu_access_pointer(sk->sk_reuseport_cb)) { /* we must lock primary chain too */ spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); if (hslot2 != nhslot2) { spin_lock(&hslot2->lock); hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); spin_lock(&nhslot2->lock); hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &nhslot2->head); nhslot2->count++; spin_unlock(&nhslot2->lock); } spin_unlock_bh(&hslot->lock); } /* Now process hash4 if necessary: * (1) update hslot4; * (2) update hslot2->hash4_cnt. * Note that hslot2/hslot4 should be checked separately, as * either of them may change with the other unchanged. 
*/ if (udp_hashed4(sk)) { spin_lock_bh(&hslot->lock); udp_rehash4(udptable, sk, newhash4); if (hslot2 != nhslot2) { spin_lock(&hslot2->lock); udp_hash4_dec(hslot2); spin_unlock(&hslot2->lock); spin_lock(&nhslot2->lock); udp_hash4_inc(nhslot2); spin_unlock(&nhslot2->lock); } spin_unlock_bh(&hslot->lock); } } } EXPORT_IPV6_MOD(udp_lib_rehash); void udp_v4_rehash(struct sock *sk) { u16 new_hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); u16 new_hash4 = udp_ehashfn(sock_net(sk), sk->sk_rcv_saddr, sk->sk_num, sk->sk_daddr, sk->sk_dport); udp_lib_rehash(sk, new_hash, new_hash4); } static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; if (inet_sk(sk)->inet_daddr) { sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); sk_incoming_cpu_update(sk); } else { sk_mark_napi_id_once(sk, skb); } rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); int drop_reason; /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) { UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; } else { UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS, is_udplite); drop_reason = SKB_DROP_REASON_PROTO_MEM; } UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); trace_udp_fail_queue_rcv_skb(rc, sk, skb); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } return 0; } /* returns: * -1: error * 0: success * >0: "udp encap" protocol resubmission * * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed. */ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); /* * Charge it to the socket, dropping if the queue is full. */ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto drop; } nf_reset_ct(skb); if (static_branch_unlikely(&udp_encap_needed_key) && READ_ONCE(up->encap_type)) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); /* * This is an encapsulation socket so pass the skb to * the socket's udp_encap_rcv() hook. Otherwise, just * fall through and pass this up the UDP socket. * up->encap_rcv() returns the following value: * =0 if skb was successfully passed to the encap * handler or was discarded by it. * >0 if skb should be passed on to UDP. * <0 if skb should be resubmitted as proto -N */ /* if we're overly short, let UDP handle it */ encap_rcv = READ_ONCE(up->encap_rcv); if (encap_rcv) { int ret; /* Verify checksum before giving to encap */ if (udp_lib_checksum_complete(skb)) goto csum_error; ret = encap_rcv(sk, skb); if (ret <= 0) { __UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, is_udplite); return -ret; } } /* FALLTHROUGH -- it's a UDP Packet */ } /* * UDP-Lite specific tests, ignored on UDP sockets */ if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) { u16 pcrlen = READ_ONCE(up->pcrlen); /* * MIB statistics other than incrementing the error count are * disabled for the following two types of errors: these depend * on the application settings, not on the functioning of the * protocol stack as such. * * RFC 3828 here recommends (sec 3.3): "There should also be a * way ... to ... at least let the receiving application block * delivery of packets with coverage values less than a value * provided by the application." 
*/ if (pcrlen == 0) { /* full coverage was set */ net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n", UDP_SKB_CB(skb)->cscov, skb->len); goto drop; } /* The next case involves violating the min. coverage requested * by the receiver. This is subtle: if receiver wants x and x is * greater than the buffersize/MTU then receiver will complain * that it wants x while sender emits packets of smaller size y. * Therefore the above ...()->partial_cov statement is essential. */ if (UDP_SKB_CB(skb)->cscov < pcrlen) { net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n", UDP_SKB_CB(skb)->cscov, pcrlen); goto drop; } } prefetch(&sk->sk_rmem_alloc); if (rcu_access_pointer(sk->sk_filter) && udp_lib_checksum_complete(skb)) goto csum_error; if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason)) goto drop; udp_csum_pull_header(skb); ipv4_pktinfo_prepare(sk, skb, true); return __udp_queue_rcv_skb(sk, skb); csum_error: drop_reason = SKB_DROP_REASON_UDP_CSUM; __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); udp_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff *next, *segs; int ret; if (likely(!udp_unexpected_gso(sk, skb))) return udp_queue_rcv_one_skb(sk, skb); BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET); __skb_push(skb, -skb_mac_offset(skb)); segs = udp_rcv_segment(sk, skb, true); skb_list_walk_safe(segs, skb, next) { __skb_pull(skb, skb_transport_offset(skb)); udp_post_segment_fix_csum(skb); ret = udp_queue_rcv_one_skb(sk, skb); if (ret > 0) ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret); } return 0; } /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old; if (dst_hold_safe(dst)) { old = unrcu_pointer(xchg(&sk->sk_rx_dst, RCU_INITIALIZER(dst))); dst_release(old); return old != dst; } return false; } EXPORT_IPV6_MOD(udp_sk_rx_dst_set); /* * Multicasts and broadcasts go to each listener. * * Note: called only from the BH handler context. 
*/ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udphdr *uh, __be32 saddr, __be32 daddr, struct udp_table *udptable, int proto) { struct sock *sk, *first = NULL; unsigned short hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); unsigned int offset = offsetof(typeof(*sk), sk_node); int dif = skb->dev->ifindex; int sdif = inet_sdif(skb); struct hlist_node *node; struct sk_buff *nskb; if (use_hash2) { hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & udptable->mask; hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: hslot = &udptable->hash2[hash2].hslot; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr, uh->source, saddr, dif, sdif, hnum)) continue; if (!first) { first = sk; continue; } nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { udp_drops_inc(sk); __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP_INC_STATS(net, UDP_MIB_INERRORS, IS_UDPLITE(sk)); continue; } if (udp_queue_rcv_skb(sk, nskb) > 0) consume_skb(nskb); } /* Also lookup *:port if we are using hash2 and haven't done so yet. */ if (use_hash2 && hash2 != hash2_any) { hash2 = hash2_any; goto start_lookup; } if (first) { if (udp_queue_rcv_skb(first, skb) > 0) consume_skb(skb); } else { kfree_skb(skb); __UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI, proto == IPPROTO_UDPLITE); } return 0; } /* Initialize UDP checksum. If exited with zero value (success), * CHECKSUM_UNNECESSARY means, that no more checks are required. * Otherwise, csum completion requires checksumming packet body, * including udp header and folding it to skb->csum. */ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) { int err; UDP_SKB_CB(skb)->partial_cov = 0; UDP_SKB_CB(skb)->cscov = skb->len; if (proto == IPPROTO_UDPLITE) { err = udplite_checksum_init(skb, uh); if (err) return err; if (UDP_SKB_CB(skb)->partial_cov) { skb->csum = inet_compute_pseudo(skb, proto); return 0; } } /* Note, we are only interested in != 0 or == 0, thus the * force to int. */ err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, inet_compute_pseudo); if (err) return err; if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) { /* If SW calculated the value, we know it's bad */ if (skb->csum_complete_sw) return 1; /* HW says the value is bad. Let's validate that. * skb->csum is no longer the full packet checksum, * so don't treat it as such. */ skb_checksum_complete_unset(skb); } return 0; } /* wrapper for udp_queue_rcv_skb taking care of csum conversion and * return code conversion for ip layer consumption */ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, struct udphdr *uh) { int ret; if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo); ret = udp_queue_rcv_skb(sk, skb); /* a return value > 0 means to resubmit the input, but * it wants the return to be -protocol, or 0 */ if (ret > 0) return -ret; return 0; } /* * All we need to do is get the socket, and then do a checksum. 
*/ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { struct sock *sk = NULL; struct udphdr *uh; unsigned short ulen; struct rtable *rt = skb_rtable(skb); __be32 saddr, daddr; struct net *net = dev_net(skb->dev); bool refcounted; int drop_reason; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; /* * Validate the packet. */ if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto drop; /* No space for header. */ uh = udp_hdr(skb); ulen = ntohs(uh->len); saddr = ip_hdr(skb)->saddr; daddr = ip_hdr(skb)->daddr; if (ulen > skb->len) goto short_packet; if (proto == IPPROTO_UDP) { /* UDP validates ulen. */ if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen)) goto short_packet; uh = udp_hdr(skb); } if (udp4_csum_init(skb, uh, proto)) goto csum_error; sk = inet_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest, &refcounted, udp_ehashfn); if (IS_ERR(sk)) goto no_sk; if (sk) { struct dst_entry *dst = skb_dst(skb); int ret; if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst)) udp_sk_rx_dst_set(sk, dst); ret = udp_unicast_rcv_skb(sk, skb, uh); if (refcounted) sock_put(sk); return ret; } if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return __udp4_lib_mcast_deliver(net, skb, uh, saddr, daddr, udptable, proto); sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk) return udp_unicast_rcv_skb(sk, skb, uh); no_sk: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; nf_reset_ct(skb); /* No socket. Drop packet silently, if checksum is wrong */ if (udp_lib_checksum_complete(skb)) goto csum_error; drop_reason = SKB_DROP_REASON_NO_SOCKET; __UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); /* * Hmm. We got an UDP packet to a port to which we * don't wanna listen. Ignore it. */ sk_skb_reason_drop(sk, skb, drop_reason); return 0; short_packet: drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", proto == IPPROTO_UDPLITE ? "Lite" : "", &saddr, ntohs(uh->source), ulen, skb->len, &daddr, ntohs(uh->dest)); goto drop; csum_error: /* * RFC1122: OK. Discards the bad packet silently (as far as * the network is concerned, anyway) as per 4.1.3.4 (MUST). */ drop_reason = SKB_DROP_REASON_UDP_CSUM; net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", proto == IPPROTO_UDPLITE ? "Lite" : "", &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), ulen); __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); drop: __UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); sk_skb_reason_drop(sk, skb, drop_reason); return 0; } /* We can only early demux multicast if there is a single matching socket. 
* If more than one socket found returns NULL */ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, int dif, int sdif) { struct udp_table *udptable = net->ipv4.udp_table; unsigned short hnum = ntohs(loc_port); struct sock *sk, *result; struct udp_hslot *hslot; unsigned int slot; slot = udp_hashfn(net, hnum, udptable->mask); hslot = &udptable->hash[slot]; /* Do not bother scanning a too big list */ if (hslot->count > 10) return NULL; result = NULL; sk_for_each_rcu(sk, &hslot->head) { if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr, rmt_port, rmt_addr, dif, sdif, hnum)) { if (result) return NULL; result = sk; } } return result; } /* For unicast we should only early demux connected sockets or we can * break forwarding setups. The chains here can be long so only check * if the first socket is an exact match and if not move on. */ static struct sock *__udp4_lib_demux_lookup(struct net *net, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, int dif, int sdif) { struct udp_table *udptable = net->ipv4.udp_table; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); unsigned short hnum = ntohs(loc_port); struct udp_hslot *hslot2; unsigned int hash2; __portpair ports; struct sock *sk; hash2 = ipv4_portaddr_hash(net, loc_addr, hnum); hslot2 = udp_hashslot2(udptable, hash2); ports = INET_COMBINED_PORTS(rmt_port, hnum); udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { if (inet_match(net, sk, acookie, ports, dif, sdif)) return sk; /* Only check first socket in chain */ break; } return NULL; } enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct in_device *in_dev = NULL; const struct iphdr *iph; const struct udphdr *uh; struct sock *sk = NULL; struct dst_entry *dst; int dif = skb->dev->ifindex; int sdif = inet_sdif(skb); int ours; /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) return SKB_NOT_DROPPED_YET; iph = ip_hdr(skb); uh = udp_hdr(skb); if (skb->pkt_type == PACKET_MULTICAST) { in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) return SKB_NOT_DROPPED_YET; ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, iph->protocol); if (!ours) return SKB_NOT_DROPPED_YET; sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif, sdif); } else if (skb->pkt_type == PACKET_HOST) { sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif, sdif); } if (!sk) return SKB_NOT_DROPPED_YET; skb->sk = sk; DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk)); skb->destructor = sock_pfree; dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, 0); if (dst) { u32 itag = 0; /* set noref for now. 
* any place which wants to hold dst has to call * dst_hold_safe() */ skb_dst_set_noref(skb, dst); /* for unconnected multicast sockets we need to validate * the source on each packet */ if (!inet_sk(sk)->inet_daddr && in_dev) return ip_mc_validate_source(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), skb->dev, in_dev, &itag); } return SKB_NOT_DROPPED_YET; } int udp_rcv(struct sk_buff *skb) { return __udp4_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP); } void udp_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); bool slow = lock_sock_fast(sk); /* protects from races with udp_abort() */ sock_set_flag(sk, SOCK_DEAD); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); if (static_branch_unlikely(&udp_encap_needed_key)) { if (up->encap_type) { void (*encap_destroy)(struct sock *sk); encap_destroy = READ_ONCE(up->encap_destroy); if (encap_destroy) encap_destroy(sk); } if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udp_encap_needed_key); udp_tunnel_cleanup_gro(sk); } } } typedef struct sk_buff *(*udp_gro_receive_t)(struct sock *sk, struct list_head *head, struct sk_buff *skb); static void set_xfrm_gro_udp_encap_rcv(__u16 encap_type, unsigned short family, struct sock *sk) { #ifdef CONFIG_XFRM udp_gro_receive_t new_gro_receive; if (udp_test_bit(GRO_ENABLED, sk) && encap_type == UDP_ENCAP_ESPINUDP) { if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) new_gro_receive = ipv6_stub->xfrm6_gro_udp_encap_rcv; else new_gro_receive = xfrm4_gro_udp_encap_rcv; if (udp_sk(sk)->gro_receive != new_gro_receive) { /* * With IPV6_ADDRFORM the gro callback could change * after being set, unregister the old one, if valid. */ if (udp_sk(sk)->gro_receive) udp_tunnel_update_gro_rcv(sk, false); WRITE_ONCE(udp_sk(sk)->gro_receive, new_gro_receive); udp_tunnel_update_gro_rcv(sk, true); } } #endif } /* * Socket option code for UDP */ int udp_lib_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen, int (*push_pending_frames)(struct sock *)) { struct udp_sock *up = udp_sk(sk); int val, valbool; int err = 0; int is_udplite = IS_UDPLITE(sk); if (level == SOL_SOCKET) { err = sk_setsockopt(sk, level, optname, optval, optlen); if (optname == SO_RCVBUF || optname == SO_RCVBUFFORCE) { sockopt_lock_sock(sk); /* paired with READ_ONCE in udp_rmem_release() */ WRITE_ONCE(up->forward_threshold, sk->sk_rcvbuf >> 2); sockopt_release_sock(sk); } return err; } if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 
1 : 0; switch (optname) { case UDP_CORK: if (val != 0) { udp_set_bit(CORK, sk); } else { udp_clear_bit(CORK, sk); lock_sock(sk); push_pending_frames(sk); release_sock(sk); } break; case UDP_ENCAP: sockopt_lock_sock(sk); switch (val) { case 0: #ifdef CONFIG_XFRM case UDP_ENCAP_ESPINUDP: set_xfrm_gro_udp_encap_rcv(val, sk->sk_family, sk); #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) WRITE_ONCE(up->encap_rcv, ipv6_stub->xfrm6_udp_encap_rcv); else #endif WRITE_ONCE(up->encap_rcv, xfrm4_udp_encap_rcv); #endif fallthrough; case UDP_ENCAP_L2TPINUDP: WRITE_ONCE(up->encap_type, val); udp_tunnel_encap_enable(sk); break; default: err = -ENOPROTOOPT; break; } sockopt_release_sock(sk); break; case UDP_NO_CHECK6_TX: udp_set_no_check6_tx(sk, valbool); break; case UDP_NO_CHECK6_RX: udp_set_no_check6_rx(sk, valbool); break; case UDP_SEGMENT: if (val < 0 || val > USHRT_MAX) return -EINVAL; WRITE_ONCE(up->gso_size, val); break; case UDP_GRO: sockopt_lock_sock(sk); /* when enabling GRO, accept the related GSO packet type */ if (valbool) udp_tunnel_encap_enable(sk); udp_assign_bit(GRO_ENABLED, sk, valbool); udp_assign_bit(ACCEPT_L4, sk, valbool); set_xfrm_gro_udp_encap_rcv(up->encap_type, sk->sk_family, sk); sockopt_release_sock(sk); break; /* * UDP-Lite's partial checksum coverage (RFC 3828). */ /* The sender sets actual checksum coverage length via this option. * The case coverage > packet length is handled by send module. */ case UDPLITE_SEND_CSCOV: if (!is_udplite) /* Disable the option on UDP sockets */ return -ENOPROTOOPT; if (val != 0 && val < 8) /* Illegal coverage: use default (8) */ val = 8; else if (val > USHRT_MAX) val = USHRT_MAX; WRITE_ONCE(up->pcslen, val); udp_set_bit(UDPLITE_SEND_CC, sk); break; /* The receiver specifies a minimum checksum coverage value. To make * sense, this should be set to at least 8 (as done below). If zero is * used, this again means full checksum coverage. */ case UDPLITE_RECV_CSCOV: if (!is_udplite) /* Disable the option on UDP sockets */ return -ENOPROTOOPT; if (val != 0 && val < 8) /* Avoid silly minimal values. */ val = 8; else if (val > USHRT_MAX) val = USHRT_MAX; WRITE_ONCE(up->pcrlen, val); udp_set_bit(UDPLITE_RECV_CC, sk); break; default: err = -ENOPROTOOPT; break; } return err; } EXPORT_IPV6_MOD(udp_lib_setsockopt); int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, optval, optlen, udp_push_pending_frames); return ip_setsockopt(sk, level, optname, optval, optlen); } int udp_lib_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct udp_sock *up = udp_sk(sk); int val, len; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; len = min_t(unsigned int, len, sizeof(int)); switch (optname) { case UDP_CORK: val = udp_test_bit(CORK, sk); break; case UDP_ENCAP: val = READ_ONCE(up->encap_type); break; case UDP_NO_CHECK6_TX: val = udp_get_no_check6_tx(sk); break; case UDP_NO_CHECK6_RX: val = udp_get_no_check6_rx(sk); break; case UDP_SEGMENT: val = READ_ONCE(up->gso_size); break; case UDP_GRO: val = udp_test_bit(GRO_ENABLED, sk); break; /* The following two cannot be changed on UDP sockets, the return is * always 0 (which corresponds to the full checksum coverage of UDP). 
*/ case UDPLITE_SEND_CSCOV: val = READ_ONCE(up->pcslen); break; case UDPLITE_RECV_CSCOV: val = READ_ONCE(up->pcrlen); break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } EXPORT_IPV6_MOD(udp_lib_getsockopt); int udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { if (level == SOL_UDP || level == SOL_UDPLITE) return udp_lib_getsockopt(sk, level, optname, optval, optlen); return ip_getsockopt(sk, level, optname, optval, optlen); } /** * udp_poll - wait for a UDP event. * @file: - file struct * @sock: - socket * @wait: - poll table * * This is same as datagram poll, except for the special case of * blocking sockets. If application is using a blocking fd * and a packet with checksum error is in the queue; * then it could get return from select indicating data available * but then block when reading it. Add special case code * to work around these arguably broken applications. */ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) { __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Check for false positives due to checksum errors */ if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) && !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1) mask &= ~(EPOLLIN | EPOLLRDNORM); /* psock ingress_msg queue should not contain any bad checksum frames */ if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; return mask; } EXPORT_IPV6_MOD(udp_poll); int udp_abort(struct sock *sk, int err) { if (!has_current_bpf_ctx()) lock_sock(sk); /* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing * with close() */ if (sock_flag(sk, SOCK_DEAD)) goto out; sk->sk_err = err; sk_error_report(sk); __udp_disconnect(sk, 0); out: if (!has_current_bpf_ctx()) release_sock(sk); return 0; } EXPORT_IPV6_MOD_GPL(udp_abort); struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, .close = udp_lib_close, .pre_connect = udp_pre_connect, .connect = udp_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, .init = udp_init_sock, .destroy = udp_destroy_sock, .setsockopt = udp_setsockopt, .getsockopt = udp_getsockopt, .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, .splice_eof = udp_splice_eof, .release_cb = ip4_datagram_release_cb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, .put_port = udp_lib_unhash, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = udp_bpf_update_proto, #endif .memory_allocated = &net_aligned_data.udp_memory_allocated, .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp_sock), .h.udp_table = NULL, .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS static unsigned short seq_file_family(const struct seq_file *seq); static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) { unsigned short family = seq_file_family(seq); /* AF_UNSPEC is used as a match all */ return ((family == AF_UNSPEC || family == sk->sk_family) && net_eq(sock_net(sk), seq_file_net(seq))); } #ifdef CONFIG_BPF_SYSCALL static const struct seq_operations 
bpf_iter_udp_seq_ops; #endif static struct udp_table *udp_get_table_seq(struct seq_file *seq, struct net *net) { const struct udp_seq_afinfo *afinfo; #ifdef CONFIG_BPF_SYSCALL if (seq->op == &bpf_iter_udp_seq_ops) return net->ipv4.udp_table; #endif afinfo = pde_data(file_inode(seq->file)); return afinfo->udp_table ? : net->ipv4.udp_table; } static struct sock *udp_get_first(struct seq_file *seq, int start) { struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); struct udp_table *udptable; struct sock *sk; udptable = udp_get_table_seq(seq, net); for (state->bucket = start; state->bucket <= udptable->mask; ++state->bucket) { struct udp_hslot *hslot = &udptable->hash[state->bucket]; if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); sk_for_each(sk, &hslot->head) { if (seq_sk_match(seq, sk)) goto found; } spin_unlock_bh(&hslot->lock); } sk = NULL; found: return sk; } static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) { struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); struct udp_table *udptable; do { sk = sk_next(sk); } while (sk && !seq_sk_match(seq, sk)); if (!sk) { udptable = udp_get_table_seq(seq, net); if (state->bucket <= udptable->mask) spin_unlock_bh(&udptable->hash[state->bucket].lock); return udp_get_first(seq, state->bucket + 1); } return sk; } static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) { struct sock *sk = udp_get_first(seq, 0); if (sk) while (pos && (sk = udp_get_next(seq, sk)) != NULL) --pos; return pos ? NULL : sk; } void *udp_seq_start(struct seq_file *seq, loff_t *pos) { struct udp_iter_state *state = seq->private; state->bucket = MAX_UDP_PORTS; return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } EXPORT_IPV6_MOD(udp_seq_start); void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == SEQ_START_TOKEN) sk = udp_get_idx(seq, 0); else sk = udp_get_next(seq, v); ++*pos; return sk; } EXPORT_IPV6_MOD(udp_seq_next); void udp_seq_stop(struct seq_file *seq, void *v) { struct udp_iter_state *state = seq->private; struct udp_table *udptable; udptable = udp_get_table_seq(seq, seq_file_net(seq)); if (state->bucket <= udptable->mask) spin_unlock_bh(&udptable->hash[state->bucket].lock); } EXPORT_IPV6_MOD(udp_seq_stop); /* ------------------------------------------------------------------------ */ static void udp4_format_sock(struct sock *sp, struct seq_file *f, int bucket) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), udp_rqueue_get(sp), 0, 0L, 0, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, sk_drops_read(sp)); } int udp4_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { struct udp_iter_state *state = seq->private; udp4_format_sock(v, seq, state->bucket); } seq_pad(seq, '\n'); return 0; } #ifdef CONFIG_BPF_SYSCALL struct bpf_iter__udp { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct udp_sock *, udp_sk); uid_t uid __aligned(8); int bucket __aligned(8); }; union bpf_udp_iter_batch_item { 
struct sock *sk; __u64 cookie; }; struct bpf_udp_iter_state { struct udp_iter_state state; unsigned int cur_sk; unsigned int end_sk; unsigned int max_sk; union bpf_udp_iter_batch_item *batch; }; static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, unsigned int new_batch_sz, gfp_t flags); static struct sock *bpf_iter_udp_resume(struct sock *first_sk, union bpf_udp_iter_batch_item *cookies, int n_cookies) { struct sock *sk = NULL; int i; for (i = 0; i < n_cookies; i++) { sk = first_sk; udp_portaddr_for_each_entry_from(sk) if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) goto done; } done: return sk; } static struct sock *bpf_iter_udp_batch(struct seq_file *seq) { struct bpf_udp_iter_state *iter = seq->private; struct udp_iter_state *state = &iter->state; unsigned int find_cookie, end_cookie; struct net *net = seq_file_net(seq); struct udp_table *udptable; unsigned int batch_sks = 0; int resume_bucket; int resizes = 0; struct sock *sk; int err = 0; resume_bucket = state->bucket; /* The current batch is done, so advance the bucket. */ if (iter->cur_sk == iter->end_sk) state->bucket++; udptable = udp_get_table_seq(seq, net); again: /* New batch for the next bucket. * Iterate over the hash table to find a bucket with sockets matching * the iterator attributes, and return the first matching socket from * the bucket. The remaining matched sockets from the bucket are batched * before releasing the bucket lock. This allows BPF programs that are * called in seq_show to acquire the bucket lock if needed. */ find_cookie = iter->cur_sk; end_cookie = iter->end_sk; iter->cur_sk = 0; iter->end_sk = 0; batch_sks = 0; for (; state->bucket <= udptable->mask; state->bucket++) { struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot; if (hlist_empty(&hslot2->head)) goto next_bucket; spin_lock_bh(&hslot2->lock); sk = hlist_entry_safe(hslot2->head.first, struct sock, __sk_common.skc_portaddr_node); /* Resume from the first (in iteration order) unseen socket from * the last batch that still exists in resume_bucket. Most of * the time this will just be where the last iteration left off * in resume_bucket unless that socket disappeared between * reads. */ if (state->bucket == resume_bucket) sk = bpf_iter_udp_resume(sk, &iter->batch[find_cookie], end_cookie - find_cookie); fill_batch: udp_portaddr_for_each_entry_from(sk) { if (seq_sk_match(seq, sk)) { if (iter->end_sk < iter->max_sk) { sock_hold(sk); iter->batch[iter->end_sk++].sk = sk; } batch_sks++; } } /* Allocate a larger batch and try again. */ if (unlikely(resizes <= 1 && iter->end_sk && iter->end_sk != batch_sks)) { resizes++; /* First, try with GFP_USER to maximize the chances of * grabbing more memory. */ if (resizes == 1) { spin_unlock_bh(&hslot2->lock); err = bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2, GFP_USER); if (err) return ERR_PTR(err); /* Start over. */ goto again; } /* Next, hold onto the lock, so the bucket doesn't * change while we get the rest of the sockets. */ err = bpf_iter_udp_realloc_batch(iter, batch_sks, GFP_NOWAIT); if (err) { spin_unlock_bh(&hslot2->lock); return ERR_PTR(err); } /* Pick up where we left off. */ sk = iter->batch[iter->end_sk - 1].sk; sk = hlist_entry_safe(sk->__sk_common.skc_portaddr_node.next, struct sock, __sk_common.skc_portaddr_node); batch_sks = iter->end_sk; goto fill_batch; } spin_unlock_bh(&hslot2->lock); if (iter->end_sk) break; next_bucket: resizes = 0; } WARN_ON_ONCE(iter->end_sk != batch_sks); return iter->end_sk ? 
iter->batch[0].sk : NULL; } static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_udp_iter_state *iter = seq->private; struct sock *sk; /* Whenever seq_next() is called, the iter->cur_sk is * done with seq_show(), so unref the iter->cur_sk. */ if (iter->cur_sk < iter->end_sk) sock_put(iter->batch[iter->cur_sk++].sk); /* After updating iter->cur_sk, check if there are more sockets * available in the current bucket batch. */ if (iter->cur_sk < iter->end_sk) sk = iter->batch[iter->cur_sk].sk; else /* Prepare a new batch. */ sk = bpf_iter_udp_batch(seq); ++*pos; return sk; } static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos) { /* bpf iter does not support lseek, so it always * continue from where it was stop()-ped. */ if (*pos) return bpf_iter_udp_batch(seq); return SEQ_START_TOKEN; } static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, struct udp_sock *udp_sk, uid_t uid, int bucket) { struct bpf_iter__udp ctx; meta->seq_num--; /* skip SEQ_START_TOKEN */ ctx.meta = meta; ctx.udp_sk = udp_sk; ctx.uid = uid; ctx.bucket = bucket; return bpf_iter_run_prog(prog, &ctx); } static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v) { struct udp_iter_state *state = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; uid_t uid; int ret; if (v == SEQ_START_TOKEN) return 0; lock_sock(sk); if (unlikely(sk_unhashed(sk))) { ret = SEQ_SKIP; goto unlock; } uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket); unlock: release_sock(sk); return ret; } static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter) { union bpf_udp_iter_batch_item *item; unsigned int cur_sk = iter->cur_sk; __u64 cookie; /* Remember the cookies of the sockets we haven't seen yet, so we can * pick up where we left off next time around. */ while (cur_sk < iter->end_sk) { item = &iter->batch[cur_sk++]; cookie = sock_gen_cookie(item->sk); sock_put(item->sk); item->cookie = cookie; } } static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v) { struct bpf_udp_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)udp_prog_seq_show(prog, &meta, v, 0, 0); } if (iter->cur_sk < iter->end_sk) bpf_iter_udp_put_batch(iter); } static const struct seq_operations bpf_iter_udp_seq_ops = { .start = bpf_iter_udp_seq_start, .next = bpf_iter_udp_seq_next, .stop = bpf_iter_udp_seq_stop, .show = bpf_iter_udp_seq_show, }; #endif static unsigned short seq_file_family(const struct seq_file *seq) { const struct udp_seq_afinfo *afinfo; #ifdef CONFIG_BPF_SYSCALL /* BPF iterator: bpf programs to filter sockets. 
*/ if (seq->op == &bpf_iter_udp_seq_ops) return AF_UNSPEC; #endif /* Proc fs iterator */ afinfo = pde_data(file_inode(seq->file)); return afinfo->family; } const struct seq_operations udp_seq_ops = { .start = udp_seq_start, .next = udp_seq_next, .stop = udp_seq_stop, .show = udp4_seq_show, }; EXPORT_IPV6_MOD(udp_seq_ops); static struct udp_seq_afinfo udp4_seq_afinfo = { .family = AF_INET, .udp_table = NULL, }; static int __net_init udp4_proc_init_net(struct net *net) { if (!proc_create_net_data("udp", 0444, net->proc_net, &udp_seq_ops, sizeof(struct udp_iter_state), &udp4_seq_afinfo)) return -ENOMEM; return 0; } static void __net_exit udp4_proc_exit_net(struct net *net) { remove_proc_entry("udp", net->proc_net); } static struct pernet_operations udp4_net_ops = { .init = udp4_proc_init_net, .exit = udp4_proc_exit_net, }; int __init udp4_proc_init(void) { return register_pernet_subsys(&udp4_net_ops); } void udp4_proc_exit(void) { unregister_pernet_subsys(&udp4_net_ops); } #endif /* CONFIG_PROC_FS */ static __initdata unsigned long uhash_entries; static int __init set_uhash_entries(char *str) { ssize_t ret; if (!str) return 0; ret = kstrtoul(str, 0, &uhash_entries); if (ret) return 0; if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN) uhash_entries = UDP_HTABLE_SIZE_MIN; return 1; } __setup("uhash_entries=", set_uhash_entries); void __init udp_table_init(struct udp_table *table, const char *name) { unsigned int i, slot_size; slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) + udp_hash4_slot_size(); table->hash = alloc_large_system_hash(name, slot_size, uhash_entries, 21, /* one slot per 2 MB */ 0, &table->log, &table->mask, UDP_HTABLE_SIZE_MIN, UDP_HTABLE_SIZE_MAX); table->hash2 = (void *)(table->hash + (table->mask + 1)); for (i = 0; i <= table->mask; i++) { INIT_HLIST_HEAD(&table->hash[i].head); table->hash[i].count = 0; spin_lock_init(&table->hash[i].lock); } for (i = 0; i <= table->mask; i++) { INIT_HLIST_HEAD(&table->hash2[i].hslot.head); table->hash2[i].hslot.count = 0; spin_lock_init(&table->hash2[i].hslot.lock); } udp_table_hash4_init(table); } u32 udp_flow_hashrnd(void) { static u32 hashrnd __read_mostly; net_get_random_once(&hashrnd, sizeof(hashrnd)); return hashrnd; } EXPORT_SYMBOL(udp_flow_hashrnd); static void __net_init udp_sysctl_init(struct net *net) { net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE; net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE; #ifdef CONFIG_NET_L3_MASTER_DEV net->ipv4.sysctl_udp_l3mdev_accept = 0; #endif } static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries) { struct udp_table *udptable; unsigned int slot_size; int i; udptable = kmalloc(sizeof(*udptable), GFP_KERNEL); if (!udptable) goto out; slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) + udp_hash4_slot_size(); udptable->hash = vmalloc_huge(hash_entries * slot_size, GFP_KERNEL_ACCOUNT); if (!udptable->hash) goto free_table; udptable->hash2 = (void *)(udptable->hash + hash_entries); udptable->mask = hash_entries - 1; udptable->log = ilog2(hash_entries); for (i = 0; i < hash_entries; i++) { INIT_HLIST_HEAD(&udptable->hash[i].head); udptable->hash[i].count = 0; spin_lock_init(&udptable->hash[i].lock); INIT_HLIST_HEAD(&udptable->hash2[i].hslot.head); udptable->hash2[i].hslot.count = 0; spin_lock_init(&udptable->hash2[i].hslot.lock); } udp_table_hash4_init(udptable); return udptable; free_table: kfree(udptable); out: return NULL; } static void __net_exit udp_pernet_table_free(struct net *net) { struct udp_table *udptable = 
net->ipv4.udp_table; if (udptable == &udp_table) return; kvfree(udptable->hash); kfree(udptable); } static void __net_init udp_set_table(struct net *net) { struct udp_table *udptable; unsigned int hash_entries; struct net *old_net; if (net_eq(net, &init_net)) goto fallback; old_net = current->nsproxy->net_ns; hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries); if (!hash_entries) goto fallback; /* Set min to keep the bitmap on stack in udp_lib_get_port() */ if (hash_entries < UDP_HTABLE_SIZE_MIN_PERNET) hash_entries = UDP_HTABLE_SIZE_MIN_PERNET; else hash_entries = roundup_pow_of_two(hash_entries); udptable = udp_pernet_table_alloc(hash_entries); if (udptable) { net->ipv4.udp_table = udptable; } else { pr_warn("Failed to allocate UDP hash table (entries: %u) " "for a netns, fallback to the global one\n", hash_entries); fallback: net->ipv4.udp_table = &udp_table; } } static int __net_init udp_pernet_init(struct net *net) { #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) int i; /* No tunnel is configured */ for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) { INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list); RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL); } #endif udp_sysctl_init(net); udp_set_table(net); return 0; } static void __net_exit udp_pernet_exit(struct net *net) { udp_pernet_table_free(net); } static struct pernet_operations __net_initdata udp_sysctl_ops = { .init = udp_pernet_init, .exit = udp_pernet_exit, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta, struct udp_sock *udp_sk, uid_t uid, int bucket) static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, unsigned int new_batch_sz, gfp_t flags) { union bpf_udp_iter_batch_item *new_batch; new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch), flags | __GFP_NOWARN); if (!new_batch) return -ENOMEM; if (flags != GFP_NOWAIT) bpf_iter_udp_put_batch(iter); memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); kvfree(iter->batch); iter->batch = new_batch; iter->max_sk = new_batch_sz; return 0; } #define INIT_BATCH_SZ 16 static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_udp_iter_state *iter = priv_data; int ret; ret = bpf_iter_init_seq_net(priv_data, aux); if (ret) return ret; ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); if (ret) bpf_iter_fini_seq_net(priv_data); iter->state.bucket = -1; return ret; } static void bpf_iter_fini_udp(void *priv_data) { struct bpf_udp_iter_state *iter = priv_data; bpf_iter_fini_seq_net(priv_data); kvfree(iter->batch); } static const struct bpf_iter_seq_info udp_seq_info = { .seq_ops = &bpf_iter_udp_seq_ops, .init_seq_private = bpf_iter_init_udp, .fini_seq_private = bpf_iter_fini_udp, .seq_priv_size = sizeof(struct bpf_udp_iter_state), }; static struct bpf_iter_reg udp_reg_info = { .target = "udp", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__udp, udp_sk), PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .seq_info = &udp_seq_info, }; static void __init bpf_iter_register(void) { udp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UDP]; if (bpf_iter_reg_target(&udp_reg_info)) pr_warn("Warning: could not register bpf iterator udp\n"); } #endif void __init udp_init(void) { unsigned long limit; udp_table_init(&udp_table, "UDP"); limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); sysctl_udp_mem[0] = limit / 4 * 3; sysctl_udp_mem[1] = limit; sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; if 
(register_pernet_subsys(&udp_sysctl_ops)) panic("UDP: failed to init sysctl parameters.\n"); #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) bpf_iter_register(); #endif }
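/*
 * Hedged illustration (not part of this file): a minimal BPF program for the
 * "udp" iterator target registered above. The context layout mirrors
 * DEFINE_BPF_ITER_FUNC(udp, meta, udp_sk, uid, bucket); includes, section
 * naming and the loader side follow the usual libbpf conventions and may need
 * adjusting for the kernel/libbpf versions in use.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("iter/udp")
int dump_udp(struct bpf_iter__udp *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct udp_sock *udp_sk = ctx->udp_sk;

	/* the program is invoked once more with a NULL socket at the end */
	if (!udp_sk)
		return 0;

	BPF_SEQ_PRINTF(seq, "bucket %d uid %u\n", ctx->bucket, ctx->uid);
	return 0;
}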
// SPDX-License-Identifier: GPL-2.0 /* * Parts of this file are * Copyright (C) 2022-2023 Intel Corporation */ #include <linux/ieee80211.h> #include <linux/export.h> #include <net/cfg80211.h> #include "nl80211.h" #include "core.h" #include "rdev-ops.h" static int ___cfg80211_stop_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id, bool notify) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_wiphy(wdev->wiphy); if (!rdev->ops->stop_ap) return -EOPNOTSUPP; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; if (!wdev->links[link_id].ap.beacon_interval) return -ENOENT; err = rdev_stop_ap(rdev, dev, link_id); if (!err) { wdev->conn_owner_nlportid = 0; wdev->links[link_id].ap.beacon_interval = 0; memset(&wdev->links[link_id].ap.chandef, 0, sizeof(wdev->links[link_id].ap.chandef)); wdev->u.ap.ssid_len = 0; rdev_set_qos_map(rdev, dev, NULL); if (notify) nl80211_send_ap_stopped(wdev, link_id); /* Should we apply the grace period during beaconing interface * shutdown also? */ cfg80211_sched_dfs_chan_update(rdev); } schedule_work(&cfg80211_disconnect_work); return err; } int cfg80211_stop_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, int link_id, bool notify) { unsigned int link; int ret = 0; if (link_id >= 0) return ___cfg80211_stop_ap(rdev, dev, link_id, notify); for_each_valid_link(dev->ieee80211_ptr, link) { int ret1 = ___cfg80211_stop_ap(rdev, dev, link, notify); if (ret1) ret = ret1; /* try the next one also if one errored */ } return ret; }
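/*
 * Hedged usage sketch (hypothetical helper, not part of cfg80211): a negative
 * link_id asks cfg80211_stop_ap() to walk every valid link via
 * for_each_valid_link() and aggregate any error, which is what interface
 * teardown paths rely on; nl80211 instead passes the specific link_id it
 * received from userspace.
 */
static int demo_stop_every_link(struct cfg80211_registered_device *rdev,
				struct net_device *dev)
{
	/* -1 == all links, true == notify userspace that the AP stopped */
	return cfg80211_stop_ap(rdev, dev, -1, true);
}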
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2025 Christoph Hellwig */ #include <linux/blk-integrity.h> #include <linux/blk-mq-dma.h> #include "blk.h" struct phys_vec { phys_addr_t paddr; u32 len; }; static bool __blk_map_iter_next(struct blk_map_iter *iter) { if (iter->iter.bi_size) return true; if (!iter->bio || !iter->bio->bi_next) return false; iter->bio = iter->bio->bi_next; if (iter->is_integrity) { iter->iter = bio_integrity(iter->bio)->bip_iter; iter->bvecs = bio_integrity(iter->bio)->bip_vec; } else { iter->iter = iter->bio->bi_iter; iter->bvecs = iter->bio->bi_io_vec; } return true; } static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, struct phys_vec *vec) { unsigned int max_size; struct bio_vec bv; if (!iter->iter.bi_size) return false; bv = mp_bvec_iter_bvec(iter->bvecs, iter->iter); vec->paddr = bvec_phys(&bv); max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX); bv.bv_len = min(bv.bv_len, max_size); bvec_iter_advance_single(iter->bvecs, &iter->iter, bv.bv_len); /* * If we are entirely done with this bi_io_vec entry, check if the next * one could be merged into it. This typically happens when moving to * the next bio, but some callers also don't pack bvecs tight.
*/ while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) { struct bio_vec next; if (!__blk_map_iter_next(iter)) break; next = mp_bvec_iter_bvec(iter->bvecs, iter->iter); if (bv.bv_len + next.bv_len > max_size || !biovec_phys_mergeable(req->q, &bv, &next)) break; bv.bv_len += next.bv_len; bvec_iter_advance_single(iter->bvecs, &iter->iter, next.bv_len); } vec->len = bv.bv_len; return true; } /* * The IOVA-based DMA API wants to be able to coalesce at the minimal IOMMU page * size granularity (which is guaranteed to be <= PAGE_SIZE and usually 4k), so * we need to ensure our segments are aligned to this as well. * * Note that there is no point in using the slightly more complicated IOVA based * path for single segment mappings. */ static inline bool blk_can_dma_map_iova(struct request *req, struct device *dma_dev) { return !((queue_virt_boundary(req->q) + 1) & dma_get_merge_boundary(dma_dev)); } static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec) { iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr); iter->len = vec->len; return true; } static bool blk_dma_map_direct(struct request *req, struct device *dma_dev, struct blk_dma_iter *iter, struct phys_vec *vec) { iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr), offset_in_page(vec->paddr), vec->len, rq_dma_dir(req)); if (dma_mapping_error(dma_dev, iter->addr)) { iter->status = BLK_STS_RESOURCE; return false; } iter->len = vec->len; return true; } static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter, struct phys_vec *vec) { enum dma_data_direction dir = rq_dma_dir(req); unsigned int mapped = 0; int error; iter->addr = state->addr; iter->len = dma_iova_size(state); do { error = dma_iova_link(dma_dev, state, vec->paddr, mapped, vec->len, dir, 0); if (error) break; mapped += vec->len; } while (blk_map_iter_next(req, &iter->iter, vec)); error = dma_iova_sync(dma_dev, state, 0, mapped); if (error) { iter->status = errno_to_blk_status(error); return false; } return true; } static inline void blk_rq_map_iter_init(struct request *rq, struct blk_map_iter *iter) { struct bio *bio = rq->bio; if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { *iter = (struct blk_map_iter) { .bvecs = &rq->special_vec, .iter = { .bi_size = rq->special_vec.bv_len, } }; } else if (bio) { *iter = (struct blk_map_iter) { .bio = bio, .bvecs = bio->bi_io_vec, .iter = bio->bi_iter, }; } else { /* the internal flush request may not have bio attached */ *iter = (struct blk_map_iter) {}; } } static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter, unsigned int total_len) { struct phys_vec vec; memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); iter->status = BLK_STS_OK; /* * Grab the first segment ASAP because we'll need it to check for P2P * transfers. */ if (!blk_map_iter_next(req, &iter->iter, &vec)) return false; switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, phys_to_page(vec.paddr))) { case PCI_P2PDMA_MAP_BUS_ADDR: if (iter->iter.is_integrity) bio_integrity(req->bio)->bip_flags |= BIP_P2P_DMA; else req->cmd_flags |= REQ_P2PDMA; return blk_dma_map_bus(iter, &vec); case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: /* * P2P transfers through the host bridge are treated the * same as non-P2P transfers below and during unmap. 
*/ case PCI_P2PDMA_MAP_NONE: break; default: iter->status = BLK_STS_INVAL; return false; } if (blk_can_dma_map_iova(req, dma_dev) && dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len)) return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec); return blk_dma_map_direct(req, dma_dev, iter, &vec); } /** * blk_rq_dma_map_iter_start - map the first DMA segment for a request * @req: request to map * @dma_dev: device to map to * @state: DMA IOVA state * @iter: block layer DMA iterator * * Start DMA mapping @req to @dma_dev. @state and @iter are provided by the * caller and don't need to be initialized. @state needs to be stored for use * at unmap time, @iter is only needed at map time. * * Returns %false if there is no segment to map, including due to an error, or * %true if it did map a segment. * * If a segment was mapped, the DMA address for it is returned in @iter.addr and * the length in @iter.len. If no segment was mapped the status code is * returned in @iter.status. * * The caller can call blk_rq_dma_map_coalesce() to check if further segments * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next() * to try to map the following segments. */ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter) { blk_rq_map_iter_init(req, &iter->iter); return blk_dma_map_iter_start(req, dma_dev, state, iter, blk_rq_payload_bytes(req)); } EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); /** * blk_rq_dma_map_iter_next - map the next DMA segment for a request * @req: request to map * @dma_dev: device to map to * @state: DMA IOVA state * @iter: block layer DMA iterator * * Iterate to the next mapping after a previous call to * blk_rq_dma_map_iter_start(). See there for a detailed description of the * arguments. * * Returns %false if there is no segment to map, including due to an error, or * %true if it did map a segment. * * If a segment was mapped, the DMA address for it is returned in @iter.addr and * the length in @iter.len. If no segment was mapped the status code is * returned in @iter.status. */ bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter) { struct phys_vec vec; if (!blk_map_iter_next(req, &iter->iter, &vec)) return false; if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) return blk_dma_map_bus(iter, &vec); return blk_dma_map_direct(req, dma_dev, iter, &vec); } EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next); static inline struct scatterlist * blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) { if (!*sg) return sglist; /* * If the driver previously mapped a shorter list, we could see a * termination bit prematurely unless it fully inits the sg table * on each mapping. We KNOW that there must be more entries here * or the driver would be buggy, so force clear the termination bit * to avoid doing a full sg_init_table() in drivers for each command. */ sg_unmark_end(*sg); return sg_next(*sg); } /* * Map a request to scatterlist, return number of sg entries setup. Caller * must make sure sg can hold rq->nr_phys_segments entries.
*/ int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg) { struct blk_map_iter iter; struct phys_vec vec; int nsegs = 0; blk_rq_map_iter_init(rq, &iter); while (blk_map_iter_next(rq, &iter, &vec)) { *last_sg = blk_next_sg(last_sg, sglist); sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, offset_in_page(vec.paddr)); nsegs++; } if (*last_sg) sg_mark_end(*last_sg); /* * Something must have been wrong if the figured number of * segment is bigger than number of req's physical segments */ WARN_ON(nsegs > blk_rq_nr_phys_segments(rq)); return nsegs; } EXPORT_SYMBOL(__blk_rq_map_sg); #ifdef CONFIG_BLK_DEV_INTEGRITY /** * blk_rq_integrity_dma_map_iter_start - map the first integrity DMA segment * for a request * @req: request to map * @dma_dev: device to map to * @state: DMA IOVA state * @iter: block layer DMA iterator * * Start DMA mapping @req integrity data to @dma_dev. @state and @iter are * provided by the caller and don't need to be initialized. @state needs to be * stored for use at unmap time, @iter is only needed at map time. * * Returns %false if there is no segment to map, including due to an error, or * %true if it did map a segment. * * If a segment was mapped, the DMA address for it is returned in @iter.addr * and the length in @iter.len. If no segment was mapped the status code is * returned in @iter.status. * * The caller can call blk_rq_dma_map_coalesce() to check if further segments * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next() * to try to map the following segments. */ bool blk_rq_integrity_dma_map_iter_start(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter) { unsigned len = bio_integrity_bytes(&req->q->limits.integrity, blk_rq_sectors(req)); struct bio *bio = req->bio; iter->iter = (struct blk_map_iter) { .bio = bio, .iter = bio_integrity(bio)->bip_iter, .bvecs = bio_integrity(bio)->bip_vec, .is_integrity = true, }; return blk_dma_map_iter_start(req, dma_dev, state, iter, len); } EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start); /** * blk_rq_integrity_dma_map_iter_next - map the next integrity DMA segment for * a request * @req: request to map * @dma_dev: device to map to * @iter: block layer DMA iterator * * Iterate to the next integrity mapping after a previous call to * blk_rq_integrity_dma_map_iter_start(). See there for a detailed description * of the arguments. * * Returns %false if there is no segment to map, including due to an error, or * %true if it did map a segment. * * If a segment was mapped, the DMA address for it is returned in @iter.addr and * the length in @iter.len. If no segment was mapped the status code is * returned in @iter.status. */ bool blk_rq_integrity_dma_map_iter_next(struct request *req, struct device *dma_dev, struct blk_dma_iter *iter) { struct phys_vec vec; if (!blk_map_iter_next(req, &iter->iter, &vec)) return false; if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) return blk_dma_map_bus(iter, &vec); return blk_dma_map_direct(req, dma_dev, iter, &vec); } EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next); /** * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist * @rq: request to map * @sglist: target scatterlist * * Description: Map the integrity vectors in request into a * scatterlist. The scatterlist must be big enough to hold all * elements. I.e. sized using blk_rq_count_integrity_sg() or * rq->nr_integrity_segments.
*/ int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) { struct request_queue *q = rq->q; struct scatterlist *sg = NULL; struct bio *bio = rq->bio; unsigned int segments = 0; struct phys_vec vec; struct blk_map_iter iter = { .bio = bio, .iter = bio_integrity(bio)->bip_iter, .bvecs = bio_integrity(bio)->bip_vec, .is_integrity = true, }; while (blk_map_iter_next(rq, &iter, &vec)) { sg = blk_next_sg(&sg, sglist); sg_set_page(sg, phys_to_page(vec.paddr), vec.len, offset_in_page(vec.paddr)); segments++; } if (sg) sg_mark_end(sg); /* * Something must have been wrong if the figured number of segment * is bigger than number of req's physical integrity segments */ BUG_ON(segments > rq->nr_integrity_segments); BUG_ON(segments > queue_max_integrity_segments(q)); return segments; } EXPORT_SYMBOL(blk_rq_map_integrity_sg); #endif
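/*
 * Hedged sketch (hypothetical driver helper, not part of this file) of the
 * calling pattern the kernel-doc above describes: start the iterator, program
 * each returned address/length pair into the device, and keep calling the
 * _next helper until it reports there is nothing left to map.
 */
static blk_status_t demo_map_request(struct request *req, struct device *dma_dev,
				     struct dma_iova_state *state)
{
	struct blk_dma_iter iter;

	if (!blk_rq_dma_map_iter_start(req, dma_dev, state, &iter))
		return iter.status;	/* BLK_STS_OK here just means "nothing to map" */

	do {
		/* hand iter.addr / iter.len to the hardware, e.g. fill one SGL entry */
	} while (blk_rq_dma_map_iter_next(req, dma_dev, state, &iter));

	return iter.status;	/* BLK_STS_OK unless a mapping attempt failed */
}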
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ #include <linux/iommu.h> #include <uapi/linux/iommufd.h> #include "../iommu-priv.h" #include "iommufd_private.h" static void __iommufd_hwpt_destroy(struct iommufd_hw_pagetable *hwpt) { if (hwpt->domain) iommu_domain_free(hwpt->domain); if (hwpt->fault) refcount_dec(&hwpt->fault->common.obj.users); } void iommufd_hwpt_paging_destroy(struct iommufd_object *obj) { struct iommufd_hwpt_paging *hwpt_paging = container_of(obj, struct iommufd_hwpt_paging, common.obj); if (!list_empty(&hwpt_paging->hwpt_item)) { mutex_lock(&hwpt_paging->ioas->mutex); list_del(&hwpt_paging->hwpt_item); mutex_unlock(&hwpt_paging->ioas->mutex); iopt_table_remove_domain(&hwpt_paging->ioas->iopt, hwpt_paging->common.domain); } __iommufd_hwpt_destroy(&hwpt_paging->common); refcount_dec(&hwpt_paging->ioas->obj.users); } void iommufd_hwpt_paging_abort(struct iommufd_object *obj) { struct iommufd_hwpt_paging *hwpt_paging = container_of(obj, struct iommufd_hwpt_paging, common.obj); /* The ioas->mutex must be held until finalize is called.
*/ lockdep_assert_held(&hwpt_paging->ioas->mutex); if (!list_empty(&hwpt_paging->hwpt_item)) { list_del_init(&hwpt_paging->hwpt_item); iopt_table_remove_domain(&hwpt_paging->ioas->iopt, hwpt_paging->common.domain); } iommufd_hwpt_paging_destroy(obj); } void iommufd_hwpt_nested_destroy(struct iommufd_object *obj) { struct iommufd_hwpt_nested *hwpt_nested = container_of(obj, struct iommufd_hwpt_nested, common.obj); __iommufd_hwpt_destroy(&hwpt_nested->common); if (hwpt_nested->viommu) refcount_dec(&hwpt_nested->viommu->obj.users); else refcount_dec(&hwpt_nested->parent->common.obj.users); } void iommufd_hwpt_nested_abort(struct iommufd_object *obj) { iommufd_hwpt_nested_destroy(obj); } static int iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging) { struct iommu_domain *paging_domain = hwpt_paging->common.domain; if (hwpt_paging->enforce_cache_coherency) return 0; if (paging_domain->ops->enforce_cache_coherency) hwpt_paging->enforce_cache_coherency = paging_domain->ops->enforce_cache_coherency( paging_domain); if (!hwpt_paging->enforce_cache_coherency) return -EINVAL; return 0; } /** * iommufd_hwpt_paging_alloc() - Get a PAGING iommu_domain for a device * @ictx: iommufd context * @ioas: IOAS to associate the domain with * @idev: Device to get an iommu_domain for * @pasid: PASID to get an iommu_domain for * @flags: Flags from userspace * @immediate_attach: True if idev should be attached to the hwpt * @user_data: The user provided driver specific data describing the domain to * create * * Allocate a new iommu_domain and return it as a hw_pagetable. The HWPT * will be linked to the given ioas and upon return the underlying iommu_domain * is fully populated. * * The caller must hold the ioas->mutex until after * iommufd_object_abort_and_destroy() or iommufd_object_finalize() is called on * the returned hwpt.
*/ struct iommufd_hwpt_paging * iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, ioasid_t pasid, u32 flags, bool immediate_attach, const struct iommu_user_data *user_data) { const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_FAULT_ID_VALID | IOMMU_HWPT_ALLOC_PASID; const struct iommu_ops *ops = dev_iommu_ops(idev->dev); struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; int rc; lockdep_assert_held(&ioas->mutex); if ((flags || user_data) && !ops->domain_alloc_paging_flags) return ERR_PTR(-EOPNOTSUPP); if (flags & ~valid_flags) return ERR_PTR(-EOPNOTSUPP); if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && !device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) return ERR_PTR(-EOPNOTSUPP); if ((flags & IOMMU_HWPT_FAULT_ID_VALID) && (flags & IOMMU_HWPT_ALLOC_NEST_PARENT)) return ERR_PTR(-EOPNOTSUPP); hwpt_paging = __iommufd_object_alloc( ictx, hwpt_paging, IOMMUFD_OBJ_HWPT_PAGING, common.obj); if (IS_ERR(hwpt_paging)) return ERR_CAST(hwpt_paging); hwpt = &hwpt_paging->common; hwpt->pasid_compat = flags & IOMMU_HWPT_ALLOC_PASID; INIT_LIST_HEAD(&hwpt_paging->hwpt_item); /* Pairs with iommufd_hw_pagetable_destroy() */ refcount_inc(&ioas->obj.users); hwpt_paging->ioas = ioas; hwpt_paging->nest_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; if (ops->domain_alloc_paging_flags) { hwpt->domain = ops->domain_alloc_paging_flags(idev->dev, flags & ~IOMMU_HWPT_FAULT_ID_VALID, user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; goto out_abort; } hwpt->domain->owner = ops; } else { hwpt->domain = iommu_paging_domain_alloc(idev->dev); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; goto out_abort; } } hwpt->domain->iommufd_hwpt = hwpt; hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD; /* * Set the coherency mode before we do iopt_table_add_domain() as some * iommus have a per-PTE bit that controls it and need to decide before * doing any maps. It is an iommu driver bug to report * IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail enforce_cache_coherency on * a new domain. * * The cache coherency mode must be configured here and unchanged later. * Note that a HWPT (non-CC) created for a device (non-CC) can be later * reused by another device (either non-CC or CC). However, A HWPT (CC) * created for a device (CC) cannot be reused by another device (non-CC) * but only devices (CC). Instead user space in this case would need to * allocate a separate HWPT (non-CC). */ if (idev->enforce_cache_coherency) { rc = iommufd_hwpt_paging_enforce_cc(hwpt_paging); if (WARN_ON(rc)) goto out_abort; } /* * immediate_attach exists only to accommodate iommu drivers that cannot * directly allocate a domain. These drivers do not finish creating the * domain until attach is completed. Thus we must have this call * sequence. Once those drivers are fixed this should be removed. 
*/ if (immediate_attach) { rc = iommufd_hw_pagetable_attach(hwpt, idev, pasid); if (rc) goto out_abort; } rc = iopt_table_add_domain(&ioas->iopt, hwpt->domain); if (rc) goto out_detach; list_add_tail(&hwpt_paging->hwpt_item, &ioas->hwpt_list); return hwpt_paging; out_detach: if (immediate_attach) iommufd_hw_pagetable_detach(idev, pasid); out_abort: iommufd_object_abort_and_destroy(ictx, &hwpt->obj); return ERR_PTR(rc); } /** * iommufd_hwpt_nested_alloc() - Get a NESTED iommu_domain for a device * @ictx: iommufd context * @parent: Parent PAGING-type hwpt to associate the domain with * @idev: Device to get an iommu_domain for * @flags: Flags from userspace * @user_data: user_data pointer. Must be valid * * Allocate a new iommu_domain (must be IOMMU_DOMAIN_NESTED) and return it as * a NESTED hw_pagetable. The given parent PAGING-type hwpt must be capable of * being a parent. */ static struct iommufd_hwpt_nested * iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, struct iommufd_hwpt_paging *parent, struct iommufd_device *idev, u32 flags, const struct iommu_user_data *user_data) { const struct iommu_ops *ops = dev_iommu_ops(idev->dev); struct iommufd_hwpt_nested *hwpt_nested; struct iommufd_hw_pagetable *hwpt; int rc; if ((flags & ~(IOMMU_HWPT_FAULT_ID_VALID | IOMMU_HWPT_ALLOC_PASID)) || !user_data->len || !ops->domain_alloc_nested) return ERR_PTR(-EOPNOTSUPP); if (parent->auto_domain || !parent->nest_parent || parent->common.domain->owner != ops) return ERR_PTR(-EINVAL); hwpt_nested = __iommufd_object_alloc( ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj); if (IS_ERR(hwpt_nested)) return ERR_CAST(hwpt_nested); hwpt = &hwpt_nested->common; hwpt->pasid_compat = flags & IOMMU_HWPT_ALLOC_PASID; refcount_inc(&parent->common.obj.users); hwpt_nested->parent = parent; hwpt->domain = ops->domain_alloc_nested( idev->dev, parent->common.domain, flags & ~IOMMU_HWPT_FAULT_ID_VALID, user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; goto out_abort; } hwpt->domain->owner = ops; hwpt->domain->iommufd_hwpt = hwpt; hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD; if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { rc = -EOPNOTSUPP; goto out_abort; } return hwpt_nested; out_abort: iommufd_object_abort_and_destroy(ictx, &hwpt->obj); return ERR_PTR(rc); } /** * iommufd_viommu_alloc_hwpt_nested() - Get a hwpt_nested for a vIOMMU * @viommu: vIOMMU object to associate the hwpt_nested/domain with * @flags: Flags from userspace * @user_data: user_data pointer. Must be valid * * Allocate a new IOMMU_DOMAIN_NESTED for a vIOMMU and return it as a NESTED * hw_pagetable.
*/ static struct iommufd_hwpt_nested * iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, const struct iommu_user_data *user_data) { struct iommufd_hwpt_nested *hwpt_nested; struct iommufd_hw_pagetable *hwpt; int rc; if (flags & ~(IOMMU_HWPT_FAULT_ID_VALID | IOMMU_HWPT_ALLOC_PASID)) return ERR_PTR(-EOPNOTSUPP); if (!user_data->len) return ERR_PTR(-EOPNOTSUPP); if (!viommu->ops || !viommu->ops->alloc_domain_nested) return ERR_PTR(-EOPNOTSUPP); hwpt_nested = __iommufd_object_alloc( viommu->ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj); if (IS_ERR(hwpt_nested)) return ERR_CAST(hwpt_nested); hwpt = &hwpt_nested->common; hwpt->pasid_compat = flags & IOMMU_HWPT_ALLOC_PASID; hwpt_nested->viommu = viommu; refcount_inc(&viommu->obj.users); hwpt_nested->parent = viommu->hwpt; hwpt->domain = viommu->ops->alloc_domain_nested( viommu, flags & ~IOMMU_HWPT_FAULT_ID_VALID, user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; goto out_abort; } hwpt->domain->iommufd_hwpt = hwpt; hwpt->domain->owner = viommu->iommu_dev->ops; hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD; if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { rc = -EOPNOTSUPP; goto out_abort; } return hwpt_nested; out_abort: iommufd_object_abort_and_destroy(viommu->ictx, &hwpt->obj); return ERR_PTR(rc); } int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_alloc *cmd = ucmd->cmd; const struct iommu_user_data user_data = { .type = cmd->data_type, .uptr = u64_to_user_ptr(cmd->data_uptr), .len = cmd->data_len, }; struct iommufd_hw_pagetable *hwpt; struct iommufd_ioas *ioas = NULL; struct iommufd_object *pt_obj; struct iommufd_device *idev; int rc; if (cmd->__reserved) return -EOPNOTSUPP; if ((cmd->data_type == IOMMU_HWPT_DATA_NONE && cmd->data_len) || (cmd->data_type != IOMMU_HWPT_DATA_NONE && !cmd->data_len)) return -EINVAL; idev = iommufd_get_device(ucmd, cmd->dev_id); if (IS_ERR(idev)) return PTR_ERR(idev); pt_obj = iommufd_get_object(ucmd->ictx, cmd->pt_id, IOMMUFD_OBJ_ANY); if (IS_ERR(pt_obj)) { rc = -EINVAL; goto out_put_idev; } if (pt_obj->type == IOMMUFD_OBJ_IOAS) { struct iommufd_hwpt_paging *hwpt_paging; ioas = container_of(pt_obj, struct iommufd_ioas, obj); mutex_lock(&ioas->mutex); hwpt_paging = iommufd_hwpt_paging_alloc( ucmd->ictx, ioas, idev, IOMMU_NO_PASID, cmd->flags, false, user_data.len ? 
&user_data : NULL); if (IS_ERR(hwpt_paging)) { rc = PTR_ERR(hwpt_paging); goto out_unlock; } hwpt = &hwpt_paging->common; } else if (pt_obj->type == IOMMUFD_OBJ_HWPT_PAGING) { struct iommufd_hwpt_nested *hwpt_nested; hwpt_nested = iommufd_hwpt_nested_alloc( ucmd->ictx, container_of(pt_obj, struct iommufd_hwpt_paging, common.obj), idev, cmd->flags, &user_data); if (IS_ERR(hwpt_nested)) { rc = PTR_ERR(hwpt_nested); goto out_unlock; } hwpt = &hwpt_nested->common; } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) { struct iommufd_hwpt_nested *hwpt_nested; struct iommufd_viommu *viommu; viommu = container_of(pt_obj, struct iommufd_viommu, obj); if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) { rc = -EINVAL; goto out_unlock; } hwpt_nested = iommufd_viommu_alloc_hwpt_nested( viommu, cmd->flags, &user_data); if (IS_ERR(hwpt_nested)) { rc = PTR_ERR(hwpt_nested); goto out_unlock; } hwpt = &hwpt_nested->common; } else { rc = -EINVAL; goto out_put_pt; } if (cmd->flags & IOMMU_HWPT_FAULT_ID_VALID) { struct iommufd_fault *fault; fault = iommufd_get_fault(ucmd, cmd->fault_id); if (IS_ERR(fault)) { rc = PTR_ERR(fault); goto out_hwpt; } hwpt->fault = fault; hwpt->domain->iopf_handler = iommufd_fault_iopf_handler; refcount_inc(&fault->common.obj.users); iommufd_put_object(ucmd->ictx, &fault->common.obj); } cmd->out_hwpt_id = hwpt->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); if (rc) goto out_hwpt; iommufd_object_finalize(ucmd->ictx, &hwpt->obj); goto out_unlock; out_hwpt: iommufd_object_abort_and_destroy(ucmd->ictx, &hwpt->obj); out_unlock: if (ioas) mutex_unlock(&ioas->mutex); out_put_pt: iommufd_put_object(ucmd->ictx, pt_obj); out_put_idev: iommufd_put_object(ucmd->ictx, &idev->obj); return rc; } int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_set_dirty_tracking *cmd = ucmd->cmd; struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_ioas *ioas; int rc = -EOPNOTSUPP; bool enable; if (cmd->flags & ~IOMMU_HWPT_DIRTY_TRACKING_ENABLE) return rc; hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); if (IS_ERR(hwpt_paging)) return PTR_ERR(hwpt_paging); ioas = hwpt_paging->ioas; enable = cmd->flags & IOMMU_HWPT_DIRTY_TRACKING_ENABLE; rc = iopt_set_dirty_tracking(&ioas->iopt, hwpt_paging->common.domain, enable); iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); return rc; } int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_get_dirty_bitmap *cmd = ucmd->cmd; struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_ioas *ioas; int rc = -EOPNOTSUPP; if ((cmd->flags & ~(IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR)) || cmd->__reserved) return -EOPNOTSUPP; hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); if (IS_ERR(hwpt_paging)) return PTR_ERR(hwpt_paging); ioas = hwpt_paging->ioas; rc = iopt_read_and_clear_dirty_data( &ioas->iopt, hwpt_paging->common.domain, cmd->flags, cmd); iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); return rc; } int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_invalidate *cmd = ucmd->cmd; struct iommu_user_data_array data_array = { .type = cmd->data_type, .uptr = u64_to_user_ptr(cmd->data_uptr), .entry_len = cmd->entry_len, .entry_num = cmd->entry_num, }; struct iommufd_object *pt_obj; u32 done_num = 0; int rc; if (cmd->__reserved) { rc = -EOPNOTSUPP; goto out; } if (cmd->entry_num && (!cmd->data_uptr || !cmd->entry_len)) { rc = -EINVAL; goto out; } pt_obj = iommufd_get_object(ucmd->ictx, cmd->hwpt_id, IOMMUFD_OBJ_ANY); if (IS_ERR(pt_obj)) { rc = 
PTR_ERR(pt_obj); goto out; } if (pt_obj->type == IOMMUFD_OBJ_HWPT_NESTED) { struct iommufd_hw_pagetable *hwpt = container_of(pt_obj, struct iommufd_hw_pagetable, obj); if (!hwpt->domain->ops || !hwpt->domain->ops->cache_invalidate_user) { rc = -EOPNOTSUPP; goto out_put_pt; } rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain, &data_array); } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) { struct iommufd_viommu *viommu = container_of(pt_obj, struct iommufd_viommu, obj); if (!viommu->ops || !viommu->ops->cache_invalidate) { rc = -EOPNOTSUPP; goto out_put_pt; } rc = viommu->ops->cache_invalidate(viommu, &data_array); } else { rc = -EINVAL; goto out_put_pt; } done_num = data_array.entry_num; out_put_pt: iommufd_put_object(ucmd->ictx, pt_obj); out: cmd->entry_num = done_num; if (iommufd_ucmd_respond(ucmd, sizeof(*cmd))) return -EFAULT; return rc; }
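/*
 * Hedged userspace sketch (not part of this file): allocating a plain PAGING
 * hw_pagetable for a bound device through the IOMMU_HWPT_ALLOC ioctl handled
 * by iommufd_hwpt_alloc() above.  Assumes the uAPI definitions from
 * <linux/iommufd.h>; error handling is trimmed and field usage should be
 * checked against the kernel headers in use.
 */
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int demo_alloc_paging_hwpt(int iommufd, __u32 dev_id, __u32 ioas_id,
				  __u32 *out_hwpt_id)
{
	struct iommu_hwpt_alloc cmd = {
		.size = sizeof(cmd),
		.dev_id = dev_id,
		.pt_id = ioas_id,			/* an IOAS id selects the PAGING path */
		.data_type = IOMMU_HWPT_DATA_NONE,	/* no driver-specific data */
	};

	if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &cmd))
		return -1;
	*out_hwpt_id = cmd.out_hwpt_id;
	return 0;
}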
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006 Thomas Gleixner * * This file contains driver APIs to the irq subsystem.
*/ #define pr_fmt(fmt) "genirq: " fmt #include <linux/irq.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> #include <linux/irqdomain.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/rt.h> #include <linux/sched/task.h> #include <linux/sched/isolation.h> #include <uapi/linux/sched/types.h> #include <linux/task_work.h> #include "internals.h" #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) DEFINE_STATIC_KEY_FALSE(force_irqthreads_key); static int __init setup_forced_irqthreads(char *arg) { static_branch_enable(&force_irqthreads_key); return 0; } early_param("threadirqs", setup_forced_irqthreads); #endif static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state); static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) { struct irq_data *irqd = irq_desc_get_irq_data(desc); bool inprogress; do { /* * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ guard(raw_spinlock_irqsave)(&desc->lock); inprogress = irqd_irq_inprogress(&desc->irq_data); /* * If requested and supported, check at the chip whether it * is in flight at the hardware level, i.e. already pending * in a CPU and waiting for service and acknowledge. */ if (!inprogress && sync_chip) { /* * Ignore the return code. inprogress is only updated * when the chip supports it. */ __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE, &inprogress); } /* Oops, that failed? */ } while (inprogress); } /** * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs) * @irq: interrupt number to wait for * * This function waits for any pending hard IRQ handlers for this interrupt * to complete before returning. If you use this function while holding a * resource the IRQ handler may need you will deadlock. It does not take * associated threaded handlers into account. * * Do not use this for shutdown scenarios where you must be sure that all * parts (hardirq and threaded handler) have completed. * * Returns: false if a threaded handler is active. * * This function may be called - with care - from IRQ context. * * It does not check whether there is an interrupt in flight at the * hardware level, but not serviced yet, as this might deadlock when called * with interrupts disabled and the target CPU of the interrupt is the * current CPU. */ bool synchronize_hardirq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { __synchronize_hardirq(desc, false); return !atomic_read(&desc->threads_active); } return true; } EXPORT_SYMBOL(synchronize_hardirq); static void __synchronize_irq(struct irq_desc *desc) { __synchronize_hardirq(desc, true); /* * We made sure that no hardirq handler is running. Now verify that no * threaded handlers are active. */ wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); } /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for * * This function waits for any pending IRQ handlers for this interrupt to * complete before returning. If you use this function while holding a * resource the IRQ handler may need you will deadlock. * * Can only be called from preemptible code as it might sleep when * an interrupt thread is associated to @irq. 
* * It optionally makes sure (when the irq chip supports that method) * that the interrupt is not pending in any CPU and waiting for * service. */ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) __synchronize_irq(desc); } EXPORT_SYMBOL(synchronize_irq); #ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; static bool __irq_can_set_affinity(struct irq_desc *desc) { if (!desc || !irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) return false; return true; } /** * irq_can_set_affinity - Check if the affinity of a given irq can be set * @irq: Interrupt to check * */ int irq_can_set_affinity(unsigned int irq) { return __irq_can_set_affinity(irq_to_desc(irq)); } /** * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space * @irq: Interrupt to check * * Like irq_can_set_affinity() above, but additionally checks for the * AFFINITY_MANAGED flag. */ bool irq_can_set_affinity_usr(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); return __irq_can_set_affinity(desc) && !irqd_affinity_is_managed(&desc->irq_data); } /** * irq_set_thread_affinity - Notify irq threads to adjust affinity * @desc: irq descriptor which has affinity changed * * Just set IRQTF_AFFINITY and delegate the affinity setting to the * interrupt thread itself. We can not call set_cpus_allowed_ptr() here as * we hold desc->lock and this code can be called from hard interrupt * context. */ static void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action; for_each_action_of_desc(desc, action) { if (action->thread) { set_bit(IRQTF_AFFINITY, &action->thread_flags); wake_up_process(action->thread); } if (action->secondary && action->secondary->thread) { set_bit(IRQTF_AFFINITY, &action->secondary->thread_flags); wake_up_process(action->secondary->thread); } } } #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static void irq_validate_effective_affinity(struct irq_data *data) { const struct cpumask *m = irq_data_get_effective_affinity_mask(data); struct irq_chip *chip = irq_data_get_irq_chip(data); if (!cpumask_empty(m)) return; pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n", chip->name, data->irq); } #else static inline void irq_validate_effective_affinity(struct irq_data *data) { } #endif static DEFINE_PER_CPU(struct cpumask, __tmp_mask); int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { struct cpumask *tmp_mask = this_cpu_ptr(&__tmp_mask); struct irq_desc *desc = irq_data_to_desc(data); struct irq_chip *chip = irq_data_get_irq_chip(data); const struct cpumask *prog_mask; int ret; if (!chip || !chip->irq_set_affinity) return -EINVAL; /* * If this is a managed interrupt and housekeeping is enabled on * it check whether the requested affinity mask intersects with * a housekeeping CPU. If so, then remove the isolated CPUs from * the mask and just keep the housekeeping CPU(s). This prevents * the affinity setter from routing the interrupt to an isolated * CPU to avoid that I/O submitted from a housekeeping CPU causes * interrupts on an isolated one. * * If the masks do not intersect or include online CPU(s) then * keep the requested mask. The isolated target CPUs are only * receiving interrupts when the I/O operation was submitted * directly from them. 
* * If all housekeeping CPUs in the affinity mask are offline, the * interrupt will be migrated by the CPU hotplug code once a * housekeeping CPU which belongs to the affinity mask comes * online. */ if (irqd_affinity_is_managed(data) && housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) { const struct cpumask *hk_mask; hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); cpumask_and(tmp_mask, mask, hk_mask); if (!cpumask_intersects(tmp_mask, cpu_online_mask)) prog_mask = mask; else prog_mask = tmp_mask; } else { prog_mask = mask; } /* * Make sure we only provide online CPUs to the irqchip, * unless we are being asked to force the affinity (in which * case we do as we are told). */ cpumask_and(tmp_mask, prog_mask, cpu_online_mask); if (!force && !cpumask_empty(tmp_mask)) ret = chip->irq_set_affinity(data, tmp_mask, force); else if (force) ret = chip->irq_set_affinity(data, mask, force); else ret = -EINVAL; switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: cpumask_copy(desc->irq_common_data.affinity, mask); fallthrough; case IRQ_SET_MASK_OK_NOCOPY: irq_validate_effective_affinity(data); irq_set_thread_affinity(desc); ret = 0; } return ret; } #ifdef CONFIG_GENERIC_PENDING_IRQ static inline int irq_set_affinity_pending(struct irq_data *data, const struct cpumask *dest) { struct irq_desc *desc = irq_data_to_desc(data); irqd_set_move_pending(data); irq_copy_pending(desc, dest); return 0; } #else static inline int irq_set_affinity_pending(struct irq_data *data, const struct cpumask *dest) { return -EBUSY; } #endif static int irq_try_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force) { int ret = irq_do_set_affinity(data, dest, force); /* * In case that the underlying vector management is busy and the * architecture supports the generic pending mechanism then utilize * this to avoid returning an error to user space. */ if (ret == -EBUSY && !force) ret = irq_set_affinity_pending(data, dest); return ret; } static bool irq_set_affinity_deactivated(struct irq_data *data, const struct cpumask *mask) { struct irq_desc *desc = irq_data_to_desc(data); /* * Handle irq chips which can handle affinity only in activated * state correctly * * If the interrupt is not yet activated, just store the affinity * mask and do not call the chip driver at all. On activation the * driver has to make sure anyway that the interrupt is in a * usable state so startup works. 
*/ if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data) || !irqd_affinity_on_activate(data)) return false; cpumask_copy(desc->irq_common_data.affinity, mask); irq_data_update_effective_affinity(data, mask); irqd_set(data, IRQD_AFFINITY_SET); return true; } int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, bool force) { struct irq_chip *chip = irq_data_get_irq_chip(data); struct irq_desc *desc = irq_data_to_desc(data); int ret = 0; if (!chip || !chip->irq_set_affinity) return -EINVAL; if (irq_set_affinity_deactivated(data, mask)) return 0; if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { ret = irq_try_set_affinity(data, mask, force); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); } if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); if (!schedule_work(&desc->affinity_notify->work)) { /* Work was already scheduled, drop our extra ref */ kref_put(&desc->affinity_notify->kref, desc->affinity_notify->release); } } irqd_set(data, IRQD_AFFINITY_SET); return ret; } /** * irq_update_affinity_desc - Update affinity management for an interrupt * @irq: The interrupt number to update * @affinity: Pointer to the affinity descriptor * * This interface can be used to configure the affinity management of * interrupts which have been allocated already. * * There are certain limitations on when it may be used - attempts to use it * for when the kernel is configured for generic IRQ reservation mode (in * config GENERIC_IRQ_RESERVATION_MODE) will fail, as it may conflict with * managed/non-managed interrupt accounting. In addition, attempts to use it on * an interrupt which is already started or which has already been configured * as managed will also fail, as these mean invalid init state or double init. */ int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity) { /* * Supporting this with the reservation scheme used by x86 needs * some more thought. Fail it for now. */ if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) return -EOPNOTSUPP; scoped_irqdesc_get_and_buslock(irq, 0) { struct irq_desc *desc = scoped_irqdesc; bool activated; /* Requires the interrupt to be shut down */ if (irqd_is_started(&desc->irq_data)) return -EBUSY; /* Interrupts which are already managed cannot be modified */ if (irqd_affinity_is_managed(&desc->irq_data)) return -EBUSY; /* * Deactivate the interrupt. That's required to undo * anything an earlier activation has established. 
*/ activated = irqd_is_activated(&desc->irq_data); if (activated) irq_domain_deactivate_irq(&desc->irq_data); if (affinity->is_managed) { irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN); } cpumask_copy(desc->irq_common_data.affinity, &affinity->mask); /* Restore the activation state */ if (activated) irq_domain_activate_irq(&desc->irq_data, false); return 0; } return -EINVAL; } static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) { struct irq_desc *desc = irq_to_desc(irq); if (!desc) return -EINVAL; guard(raw_spinlock_irqsave)(&desc->lock); return irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); } /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity * @cpumask: cpumask * * Fails if cpumask does not contain an online CPU */ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) { return __irq_set_affinity(irq, cpumask, false); } EXPORT_SYMBOL_GPL(irq_set_affinity); /** * irq_force_affinity - Force the irq affinity of a given irq * @irq: Interrupt to set affinity * @cpumask: cpumask * * Same as irq_set_affinity, but without checking the mask against * online cpus. * * Solely for low level cpu hotplug code, where we need to make per * cpu interrupts affine before the cpu becomes online. */ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) { return __irq_set_affinity(irq, cpumask, true); } EXPORT_SYMBOL_GPL(irq_force_affinity); int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity) { int ret = -EINVAL; scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { scoped_irqdesc->affinity_hint = m; ret = 0; } if (!ret && m && setaffinity) __irq_set_affinity(irq, m, false); return ret; } EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint); static void irq_affinity_notify(struct work_struct *work) { struct irq_affinity_notify *notify = container_of(work, struct irq_affinity_notify, work); struct irq_desc *desc = irq_to_desc(notify->irq); cpumask_var_t cpumask; if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) goto out; scoped_guard(raw_spinlock_irqsave, &desc->lock) { if (irq_move_pending(&desc->irq_data)) irq_get_pending(cpumask, desc); else cpumask_copy(cpumask, desc->irq_common_data.affinity); } notify->notify(notify, cpumask); free_cpumask_var(cpumask); out: kref_put(&notify->kref, notify->release); } /** * irq_set_affinity_notifier - control notification of IRQ affinity changes * @irq: Interrupt for which to enable/disable notification * @notify: Context for notification, or %NULL to disable * notification. Function pointers must be initialised; * the other fields will be initialised by this function. * * Must be called in process context. Notification may only be enabled * after the IRQ is allocated and must be disabled before the IRQ is freed * using free_irq(). 
*/ int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { struct irq_desc *desc = irq_to_desc(irq); struct irq_affinity_notify *old_notify; /* The release function is promised process context */ might_sleep(); if (!desc || irq_is_nmi(desc)) return -EINVAL; /* Complete initialisation of *notify */ if (notify) { notify->irq = irq; kref_init(&notify->kref); INIT_WORK(&notify->work, irq_affinity_notify); } scoped_guard(raw_spinlock_irqsave, &desc->lock) { old_notify = desc->affinity_notify; desc->affinity_notify = notify; } if (old_notify) { if (cancel_work_sync(&old_notify->work)) { /* Pending work had a ref, put that one too */ kref_put(&old_notify->kref, old_notify->release); } kref_put(&old_notify->kref, old_notify->release); } return 0; } EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); #ifndef CONFIG_AUTO_IRQ_AFFINITY /* * Generic version of the affinity autoselector. */ int irq_setup_affinity(struct irq_desc *desc) { struct cpumask *set = irq_default_affinity; int node = irq_desc_get_node(desc); static DEFINE_RAW_SPINLOCK(mask_lock); static struct cpumask mask; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!__irq_can_set_affinity(desc)) return 0; guard(raw_spinlock)(&mask_lock); /* * Preserve the managed affinity setting and a userspace affinity * setup, but make sure that one of the targets is online. */ if (irqd_affinity_is_managed(&desc->irq_data) || irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { if (cpumask_intersects(desc->irq_common_data.affinity, cpu_online_mask)) set = desc->irq_common_data.affinity; else irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); } cpumask_and(&mask, cpu_online_mask, set); if (cpumask_empty(&mask)) cpumask_copy(&mask, cpu_online_mask); if (node != NUMA_NO_NODE) { const struct cpumask *nodemask = cpumask_of_node(node); /* make sure at least one of the cpus in nodemask is online */ if (cpumask_intersects(&mask, nodemask)) cpumask_and(&mask, &mask, nodemask); } return irq_do_set_affinity(&desc->irq_data, &mask, false); } #else /* Wrapper for ALPHA specific affinity selector magic */ int irq_setup_affinity(struct irq_desc *desc) { return irq_select_affinity(irq_desc_get_irq(desc)); } #endif /* CONFIG_AUTO_IRQ_AFFINITY */ #endif /* CONFIG_SMP */ /** * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt * @irq: interrupt number to set affinity * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU * specific data for percpu_devid interrupts * * This function uses the vCPU specific data to set the vCPU affinity for * an irq. The vCPU specific data is passed from outside, such as KVM. One * example code path is as below: KVM -> IOMMU -> irq_set_vcpu_affinity(). 
*/ int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) { scoped_irqdesc_get_and_lock(irq, 0) { struct irq_desc *desc = scoped_irqdesc; struct irq_data *data; struct irq_chip *chip; data = irq_desc_get_irq_data(desc); do { chip = irq_data_get_irq_chip(data); if (chip && chip->irq_set_vcpu_affinity) break; data = irqd_get_parent_data(data); } while (data); if (!data) return -ENOSYS; return chip->irq_set_vcpu_affinity(data, vcpu_info); } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); void __disable_irq(struct irq_desc *desc) { if (!desc->depth++) irq_disable(desc); } static int __disable_irq_nosync(unsigned int irq) { scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { __disable_irq(scoped_irqdesc); return 0; } return -EINVAL; } /** * disable_irq_nosync - disable an irq without waiting * @irq: Interrupt to disable * * Disable the selected interrupt line. Disables and Enables are * nested. * Unlike disable_irq(), this function does not ensure existing * instances of the IRQ handler have completed before returning. * * This function may be called from IRQ context. */ void disable_irq_nosync(unsigned int irq) { __disable_irq_nosync(irq); } EXPORT_SYMBOL(disable_irq_nosync); /** * disable_irq - disable an irq and wait for completion * @irq: Interrupt to disable * * Disable the selected interrupt line. Enables and Disables are nested. * * This function waits for any pending IRQ handlers for this interrupt to * complete before returning. If you use this function while holding a * resource the IRQ handler may need you will deadlock. * * Can only be called from preemptible code as it might sleep when an * interrupt thread is associated to @irq. * */ void disable_irq(unsigned int irq) { might_sleep(); if (!__disable_irq_nosync(irq)) synchronize_irq(irq); } EXPORT_SYMBOL(disable_irq); /** * disable_hardirq - disables an irq and waits for hardirq completion * @irq: Interrupt to disable * * Disable the selected interrupt line. Enables and Disables are nested. * * This function waits for any pending hard IRQ handlers for this interrupt * to complete before returning. If you use this function while holding a * resource the hard IRQ handler may need you will deadlock. * * When used to optimistically disable an interrupt from atomic context the * return value must be checked. * * Returns: false if a threaded handler is active. * * This function may be called - with care - from IRQ context. */ bool disable_hardirq(unsigned int irq) { if (!__disable_irq_nosync(irq)) return synchronize_hardirq(irq); return false; } EXPORT_SYMBOL_GPL(disable_hardirq); /** * disable_nmi_nosync - disable an nmi without waiting * @irq: Interrupt to disable * * Disable the selected interrupt line. Disables and enables are nested. * * The interrupt to disable must have been requested through request_nmi. * Unlike disable_nmi(), this function does not ensure existing * instances of the IRQ handler have completed before returning. */ void disable_nmi_nosync(unsigned int irq) { disable_irq_nosync(irq); } void __enable_irq(struct irq_desc *desc) { switch (desc->depth) { case 0: err_out: WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq_desc_get_irq(desc)); break; case 1: { if (desc->istate & IRQS_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ irq_settings_set_noprobe(desc); /* * Call irq_startup() not irq_enable() here because the * interrupt might be marked NOAUTOEN so irq_startup() * needs to be invoked when it gets enabled the first time. 
* This is also required when __enable_irq() is invoked for * a managed and shutdown interrupt from the S3 resume * path. * * If it was already started up, then irq_startup() will * invoke irq_enable() under the hood. */ irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE); break; } default: desc->depth--; } } /** * enable_irq - enable handling of an irq * @irq: Interrupt to enable * * Undoes the effect of one call to disable_irq(). If this matches the * last disable, processing of interrupts on this IRQ line is re-enabled. * * This function may be called from IRQ context only when * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! */ void enable_irq(unsigned int irq) { scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { struct irq_desc *desc = scoped_irqdesc; if (WARN(!desc->irq_data.chip, "enable_irq before setup/request_irq: irq %u\n", irq)) return; __enable_irq(desc); } } EXPORT_SYMBOL(enable_irq); /** * enable_nmi - enable handling of an nmi * @irq: Interrupt to enable * * The interrupt to enable must have been requested through request_nmi. * Undoes the effect of one call to disable_nmi(). If this matches the last * disable, processing of interrupts on this IRQ line is re-enabled. */ void enable_nmi(unsigned int irq) { enable_irq(irq); } static int set_irq_wake_real(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; if (irq_desc_get_chip(desc)->flags & IRQCHIP_SKIP_SET_WAKE) return 0; if (desc->irq_data.chip->irq_set_wake) ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); return ret; } /** * irq_set_irq_wake - control irq power management wakeup * @irq: interrupt to control * @on: enable/disable power management wakeup * * Enable/disable power management wakeup mode, which is disabled by * default. Enables and disables must match, just as they match for * non-wakeup mode support. * * Wakeup mode lets this IRQ wake the system from sleep states like * "suspend to RAM". * * Note: irq enable/disable state is completely orthogonal to the * enable/disable state of irq wake. An irq can be disabled with * disable_irq() and still wake the system as long as the irq has wake * enabled. If this does not hold, then the underlying irq chip and the * related driver need to be investigated. */ int irq_set_irq_wake(unsigned int irq, unsigned int on) { scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { struct irq_desc *desc = scoped_irqdesc; int ret = 0; /* Don't use NMIs as wake up interrupts please */ if (irq_is_nmi(desc)) return -EINVAL; /* * wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ if (on) { if (desc->wake_depth++ == 0) { ret = set_irq_wake_real(irq, on); if (ret) desc->wake_depth = 0; else irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); } } else { if (desc->wake_depth == 0) { WARN(1, "Unbalanced IRQ %d wake disable\n", irq); } else if (--desc->wake_depth == 0) { ret = set_irq_wake_real(irq, on); if (ret) desc->wake_depth = 1; else irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } return ret; } return -EINVAL; } EXPORT_SYMBOL(irq_set_irq_wake); /* * Internal function that tells the architecture code whether a * particular irq has been exclusively allocated or is available * for driver use. 
*/ bool can_request_irq(unsigned int irq, unsigned long irqflags) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { struct irq_desc *desc = scoped_irqdesc; if (irq_settings_can_request(desc)) { if (!desc->action || irqflags & desc->action->flags & IRQF_SHARED) return true; } } return false; } int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) { struct irq_chip *chip = desc->irq_data.chip; int ret, unmask = 0; if (!chip || !chip->irq_set_type) { /* * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? */ pr_debug("No set_type function for IRQ %d (%s)\n", irq_desc_get_irq(desc), chip ? (chip->name ? : "unknown") : "unknown"); return 0; } if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { if (!irqd_irq_masked(&desc->irq_data)) mask_irq(desc); if (!irqd_irq_disabled(&desc->irq_data)) unmask = 1; } /* Mask all flags except trigger mode */ flags &= IRQ_TYPE_SENSE_MASK; ret = chip->irq_set_type(&desc->irq_data, flags); switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); irqd_set(&desc->irq_data, flags); fallthrough; case IRQ_SET_MASK_OK_NOCOPY: flags = irqd_get_trigger_type(&desc->irq_data); irq_settings_set_trigger_mask(desc, flags); irqd_clear(&desc->irq_data, IRQD_LEVEL); irq_settings_clr_level(desc); if (flags & IRQ_TYPE_LEVEL_MASK) { irq_settings_set_level(desc); irqd_set(&desc->irq_data, IRQD_LEVEL); } ret = 0; break; default: pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n", flags, irq_desc_get_irq(desc), chip->irq_set_type); } if (unmask) unmask_irq(desc); return ret; } #ifdef CONFIG_HARDIRQS_SW_RESEND int irq_set_parent(int irq, int parent_irq) { scoped_irqdesc_get_and_lock(irq, 0) { scoped_irqdesc->parent_irq = parent_irq; return 0; } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_parent); #endif /* * Default primary interrupt handler for threaded interrupts. Is * assigned as primary handler when request_threaded_irq is called * with handler == NULL. Useful for oneshot interrupts. */ static irqreturn_t irq_default_primary_handler(int irq, void *dev_id) { return IRQ_WAKE_THREAD; } /* * Primary handler for nested threaded interrupts. Should never be * called. */ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) { WARN(1, "Primary handler called for nested irq %d\n", irq); return IRQ_NONE; } static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id) { WARN(1, "Secondary action handler called for irq %d\n", irq); return IRQ_NONE; } #ifdef CONFIG_SMP /* * Check whether we need to change the affinity of the interrupt thread. */ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { cpumask_var_t mask; bool valid = false; if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) return; __set_current_state(TASK_RUNNING); /* * In case we are out of memory we set IRQTF_AFFINITY again and * try again next time */ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { set_bit(IRQTF_AFFINITY, &action->thread_flags); return; } scoped_guard(raw_spinlock_irq, &desc->lock) { /* * This code is triggered unconditionally. Check the affinity * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. 
*/ if (cpumask_available(desc->irq_common_data.affinity)) { const struct cpumask *m; m = irq_data_get_effective_affinity_mask(&desc->irq_data); cpumask_copy(mask, m); valid = true; } } if (valid) set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); } #else static inline void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } #endif static int irq_wait_for_interrupt(struct irq_desc *desc, struct irqaction *action) { for (;;) { set_current_state(TASK_INTERRUPTIBLE); irq_thread_check_affinity(desc, action); if (kthread_should_stop()) { /* may need to run one last time */ if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { __set_current_state(TASK_RUNNING); return 0; } __set_current_state(TASK_RUNNING); return -1; } if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { __set_current_state(TASK_RUNNING); return 0; } schedule(); } } /* * Oneshot interrupts keep the irq line masked until the threaded * handler finished. unmask if the interrupt has not been disabled and * is marked MASKED. */ static void irq_finalize_oneshot(struct irq_desc *desc, struct irqaction *action) { if (!(desc->istate & IRQS_ONESHOT) || action->handler == irq_forced_secondary_handler) return; again: chip_bus_lock(desc); raw_spin_lock_irq(&desc->lock); /* * Implausible though it may be we need to protect us against * the following scenario: * * The thread is faster done than the hard interrupt handler * on the other CPU. If we unmask the irq line then the * interrupt can come in again and masks the line, leaves due * to IRQS_INPROGRESS and the irq line is masked forever. * * This also serializes the state of shared oneshot handlers * versus "desc->threads_oneshot |= action->thread_mask;" in * irq_wake_thread(). See the comment there which explains the * serialization. */ if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); cpu_relax(); goto again; } /* * Now check again, whether the thread should run. Otherwise * we would clear the threads_oneshot bit of this thread which * was just set. */ if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) goto out_unlock; desc->threads_oneshot &= ~action->thread_mask; if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && irqd_irq_masked(&desc->irq_data)) unmask_threaded_irq(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); } /* * Interrupts explicitly requested as threaded interrupts want to be * preemptible - many of them need to sleep and wait for slow busses to * complete. */ static irqreturn_t irq_thread_fn(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret = action->thread_fn(action->irq, action->dev_id); if (ret == IRQ_HANDLED) atomic_inc(&desc->threads_handled); irq_finalize_oneshot(desc, action); return ret; } /* * Interrupts which are not explicitly requested as threaded * interrupts rely on the implicit bh/preempt disable of the hard irq * context. So we need to disable bh here to avoid deadlocks and other * side effects. 
*/ static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret; local_bh_disable(); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_disable(); ret = irq_thread_fn(desc, action); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_enable(); local_bh_enable(); return ret; } void wake_threads_waitq(struct irq_desc *desc) { if (atomic_dec_and_test(&desc->threads_active)) wake_up(&desc->wait_for_threads); } static void irq_thread_dtor(struct callback_head *unused) { struct task_struct *tsk = current; struct irq_desc *desc; struct irqaction *action; if (WARN_ON_ONCE(!(current->flags & PF_EXITING))) return; action = kthread_data(tsk); pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", tsk->comm, tsk->pid, action->irq); desc = irq_to_desc(action->irq); /* * If IRQTF_RUNTHREAD is set, we need to decrement * desc->threads_active and wake possible waiters. */ if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) wake_threads_waitq(desc); /* Prevent a stale desc->threads_oneshot */ irq_finalize_oneshot(desc, action); } static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action) { struct irqaction *secondary = action->secondary; if (WARN_ON_ONCE(!secondary)) return; guard(raw_spinlock_irq)(&desc->lock); __irq_wake_thread(desc, secondary); } /* * Internal function to notify that a interrupt thread is ready. */ static void irq_thread_set_ready(struct irq_desc *desc, struct irqaction *action) { set_bit(IRQTF_READY, &action->thread_flags); wake_up(&desc->wait_for_threads); } /* * Internal function to wake up a interrupt thread and wait until it is * ready. */ static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc, struct irqaction *action) { if (!action || !action->thread) return; wake_up_process(action->thread); wait_event(desc->wait_for_threads, test_bit(IRQTF_READY, &action->thread_flags)); } /* * Interrupt handler thread */ static int irq_thread(void *data) { struct callback_head on_exit_work; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); irqreturn_t (*handler_fn)(struct irq_desc *desc, struct irqaction *action); irq_thread_set_ready(desc, action); sched_set_fifo(current); if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, &action->thread_flags)) handler_fn = irq_forced_thread_fn; else handler_fn = irq_thread_fn; init_task_work(&on_exit_work, irq_thread_dtor); task_work_add(current, &on_exit_work, TWA_NONE); while (!irq_wait_for_interrupt(desc, action)) { irqreturn_t action_ret; action_ret = handler_fn(desc, action); if (action_ret == IRQ_WAKE_THREAD) irq_wake_secondary(desc, action); wake_threads_waitq(desc); } /* * This is the regular exit path. __free_irq() is stopping the * thread via kthread_stop() after calling * synchronize_hardirq(). So neither IRQTF_RUNTHREAD nor the * oneshot mask bit can be set. 
*/ task_work_cancel_func(current, irq_thread_dtor); return 0; } /** * irq_wake_thread - wake the irq thread for the action identified by dev_id * @irq: Interrupt line * @dev_id: Device identity for which the thread should be woken */ void irq_wake_thread(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return; guard(raw_spinlock_irqsave)(&desc->lock); for_each_action_of_desc(desc, action) { if (action->dev_id == dev_id) { if (action->thread) __irq_wake_thread(desc, action); break; } } } EXPORT_SYMBOL_GPL(irq_wake_thread); static int irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads()) return 0; if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) return 0; /* * No further action required for interrupts which are requested as * threaded interrupts already */ if (new->handler == irq_default_primary_handler) return 0; new->flags |= IRQF_ONESHOT; /* * Handle the case where we have a real primary handler and a * thread handler. We force thread them as well by creating a * secondary action. */ if (new->handler && new->thread_fn) { /* Allocate the secondary action */ new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!new->secondary) return -ENOMEM; new->secondary->handler = irq_forced_secondary_handler; new->secondary->thread_fn = new->thread_fn; new->secondary->dev_id = new->dev_id; new->secondary->irq = new->irq; new->secondary->name = new->name; } /* Deal with the primary handler */ set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); new->thread_fn = new->handler; new->handler = irq_default_primary_handler; return 0; } static int irq_request_resources(struct irq_desc *desc) { struct irq_data *d = &desc->irq_data; struct irq_chip *c = d->chip; return c->irq_request_resources ? c->irq_request_resources(d) : 0; } static void irq_release_resources(struct irq_desc *desc) { struct irq_data *d = &desc->irq_data; struct irq_chip *c = d->chip; if (c->irq_release_resources) c->irq_release_resources(d); } static bool irq_supports_nmi(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /* Only IRQs directly managed by the root irqchip can be set as NMI */ if (d->parent_data) return false; #endif /* Don't support NMIs for chips behind a slow bus */ if (d->chip->irq_bus_lock || d->chip->irq_bus_sync_unlock) return false; return d->chip->flags & IRQCHIP_SUPPORTS_NMI; } static int irq_nmi_setup(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); struct irq_chip *c = d->chip; return c->irq_nmi_setup ? c->irq_nmi_setup(d) : -EINVAL; } static void irq_nmi_teardown(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); struct irq_chip *c = d->chip; if (c->irq_nmi_teardown) c->irq_nmi_teardown(d); } static int setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) { struct task_struct *t; if (!secondary) { t = kthread_create(irq_thread, new, "irq/%d-%s", irq, new->name); } else { t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq, new->name); } if (IS_ERR(t)) return PTR_ERR(t); /* * We keep the reference to the task struct even if * the thread dies to avoid that the interrupt code * references an already freed task_struct. */ new->thread = get_task_struct(t); /* * Tell the thread to set its affinity. 
This is * important for shared interrupt handlers as we do * not invoke setup_affinity() for the secondary * handlers as everything is already set up. Even for * interrupts marked with IRQF_NO_BALANCE this is * correct as we want the thread to move to the cpu(s) * on which the requesting code placed the interrupt. */ set_bit(IRQTF_AFFINITY, &new->thread_flags); return 0; } /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. * * Locking rules: * * desc->request_mutex Provides serialization against a concurrent free_irq() * chip_bus_lock Provides serialization for slow bus operations * desc->lock Provides serialization against hard interrupts * * chip_bus_lock and desc->lock are sufficient for all other management and * interrupt related functions. desc->request_mutex solely serializes * request/free_irq(). */ static int __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **old_ptr; unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; if (!desc) return -EINVAL; if (desc->irq_data.chip == &no_irq_chip) return -ENOSYS; if (!try_module_get(desc->owner)) return -ENODEV; new->irq = irq; /* * If the trigger type is not specified by the caller, * then use the default for this interrupt. */ if (!(new->flags & IRQF_TRIGGER_MASK)) new->flags |= irqd_get_trigger_type(&desc->irq_data); /* * Check whether the interrupt nests into another interrupt * thread. */ nested = irq_settings_is_nested_thread(desc); if (nested) { if (!new->thread_fn) { ret = -EINVAL; goto out_mput; } /* * Replace the primary handler which was provided from * the driver for non nested interrupt handling by the * dummy function which warns when called. */ new->handler = irq_nested_primary_handler; } else { if (irq_settings_can_thread(desc)) { ret = irq_setup_forced_threading(new); if (ret) goto out_mput; } } /* * Create a handler thread when a thread function is supplied * and the interrupt does not nest into another interrupt * thread. */ if (new->thread_fn && !nested) { ret = setup_irq_thread(new, irq, false); if (ret) goto out_mput; if (new->secondary) { ret = setup_irq_thread(new->secondary, irq, true); if (ret) goto out_thread; } } /* * Drivers are often written to work w/o knowledge about the * underlying irq chip implementation, so a request for a * threaded irq without a primary hard irq context handler * requires the ONESHOT flag to be set. Some irq chips like * MSI based interrupts are per se one shot safe. Check the * chip flags, so we can avoid the unmask dance at the end of * the threaded handler for those. */ if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) new->flags &= ~IRQF_ONESHOT; /* * Protects against a concurrent __free_irq() call which might wait * for synchronize_hardirq() to complete without holding the optional * chip bus lock and desc->lock. Also protects against handing out * a recycled oneshot thread_mask bit while it's still in use by * its previous owner. */ mutex_lock(&desc->request_mutex); /* * Acquire bus lock as the irq_request_resources() callback below * might rely on the serialization or the magic power management * functions which are abusing the irq_bus_lock() callback, */ chip_bus_lock(desc); /* First installed action requests resources. 
*/ if (!desc->action) { ret = irq_request_resources(desc); if (ret) { pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", new->name, irq, desc->irq_data.chip->name); goto out_bus_unlock; } } /* * The following block of code has to be executed atomically * protected against a concurrent interrupt and any of the other * management calls which are not serialized via * desc->request_mutex or the optional bus lock. */ raw_spin_lock_irqsave(&desc->lock, flags); old_ptr = &desc->action; old = *old_ptr; if (old) { /* * Can't share interrupts unless both agree to and are * the same type (level, edge, polarity). So both flag * fields must have IRQF_SHARED set and the bits which * set the trigger type must match. Also all must * agree on ONESHOT. * Interrupt lines used for NMIs cannot be shared. */ unsigned int oldtype; if (irq_is_nmi(desc)) { pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n", new->name, irq, desc->irq_data.chip->name); ret = -EINVAL; goto out_unlock; } /* * If nobody did set the configuration before, inherit * the one provided by the requester. */ if (irqd_trigger_type_was_set(&desc->irq_data)) { oldtype = irqd_get_trigger_type(&desc->irq_data); } else { oldtype = new->flags & IRQF_TRIGGER_MASK; irqd_set_trigger_type(&desc->irq_data, oldtype); } if (!((old->flags & new->flags) & IRQF_SHARED) || (oldtype != (new->flags & IRQF_TRIGGER_MASK))) goto mismatch; if ((old->flags & IRQF_ONESHOT) && (new->flags & IRQF_COND_ONESHOT)) new->flags |= IRQF_ONESHOT; else if ((old->flags ^ new->flags) & IRQF_ONESHOT) goto mismatch; /* All handlers must agree on per-cpuness */ if ((old->flags & IRQF_PERCPU) != (new->flags & IRQF_PERCPU)) goto mismatch; /* add new interrupt at end of irq queue */ do { /* * Or all existing action->thread_mask bits, * so we can find the next zero bit for this * new action. */ thread_mask |= old->thread_mask; old_ptr = &old->next; old = *old_ptr; } while (old); shared = 1; } /* * Setup the thread mask for this irqaction for ONESHOT. For * !ONESHOT irqs the thread mask is 0 so we can avoid a * conditional in irq_wake_thread(). */ if (new->flags & IRQF_ONESHOT) { /* * Unlikely to have 32 resp 64 irqs sharing one line, * but who knows. */ if (thread_mask == ~0UL) { ret = -EBUSY; goto out_unlock; } /* * The thread_mask for the action is or'ed to * desc->thread_active to indicate that the * IRQF_ONESHOT thread handler has been woken, but not * yet finished. The bit is cleared when a thread * completes. When all threads of a shared interrupt * line have completed desc->threads_active becomes * zero and the interrupt line is unmasked. See * handle.c:irq_wake_thread() for further information. * * If no thread is woken by primary (hard irq context) * interrupt handlers, then desc->threads_active is * also checked for zero to unmask the irq line in the * affected hard irq flow handlers * (handle_[fasteoi|level]_irq). * * The new action gets the first zero bit of * thread_mask assigned. See the loop above which or's * all existing action->thread_mask bits. */ new->thread_mask = 1UL << ffz(thread_mask); } else if (new->handler == irq_default_primary_handler && !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) { /* * The interrupt was requested with handler = NULL, so * we use the default primary handler for it. But it * does not have the oneshot flag set. 
In combination * with level interrupts this is deadly, because the * default primary handler just wakes the thread, then * the irq lines is reenabled, but the device still * has the level irq asserted. Rinse and repeat.... * * While this works for edge type interrupts, we play * it safe and reject unconditionally because we can't * say for sure which type this interrupt really * has. The type flags are unreliable as the * underlying chip implementation can override them. */ pr_err("Threaded irq requested with handler=NULL and !ONESHOT for %s (irq %d)\n", new->name, irq); ret = -EINVAL; goto out_unlock; } if (!shared) { /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { ret = __irq_set_trigger(desc, new->flags & IRQF_TRIGGER_MASK); if (ret) goto out_unlock; } /* * Activate the interrupt. That activation must happen * independently of IRQ_NOAUTOEN. request_irq() can fail * and the callers are supposed to handle * that. enable_irq() of an interrupt requested with * IRQ_NOAUTOEN is not supposed to fail. The activation * keeps it in shutdown mode, it merily associates * resources if necessary and if that's not possible it * fails. Interrupts which are in managed shutdown mode * will simply ignore that activation request. */ ret = irq_activate(desc); if (ret) goto out_unlock; desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ IRQS_ONESHOT | IRQS_WAITING); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); if (new->flags & IRQF_PERCPU) { irqd_set(&desc->irq_data, IRQD_PER_CPU); irq_settings_set_per_cpu(desc); if (new->flags & IRQF_NO_DEBUG) irq_settings_set_no_debug(desc); } if (noirqdebug) irq_settings_set_no_debug(desc); if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; /* Exclude IRQ from balancing if requested */ if (new->flags & IRQF_NOBALANCING) { irq_settings_set_no_balancing(desc); irqd_set(&desc->irq_data, IRQD_NO_BALANCING); } if (!(new->flags & IRQF_NO_AUTOEN) && irq_settings_can_autoenable(desc)) { irq_startup(desc, IRQ_RESEND, IRQ_START_COND); } else { /* * Shared interrupts do not go well with disabling * auto enable. The sharing interrupt might request * it while it's still disabled and then wait for * interrupts forever. */ WARN_ON_ONCE(new->flags & IRQF_SHARED); /* Undo nested disables: */ desc->depth = 1; } } else if (new->flags & IRQF_TRIGGER_MASK) { unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; unsigned int omsk = irqd_get_trigger_type(&desc->irq_data); if (nmsk != omsk) /* hope the handler works with current trigger mode */ pr_warn("irq %d uses trigger mode %u; requested %u\n", irq, omsk, nmsk); } *old_ptr = new; irq_pm_install_action(desc, new); /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; desc->irqs_unhandled = 0; /* * Check whether we disabled the irq via the spurious handler * before. Reenable it and give it another chance. */ if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { desc->istate &= ~IRQS_SPURIOUS_DISABLED; __enable_irq(desc); } raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); irq_setup_timings(desc, new); wake_up_and_wait_for_irq_thread_ready(desc, new); wake_up_and_wait_for_irq_thread_ready(desc, new->secondary); register_irq_proc(irq, desc); new->dir = NULL; register_handler_proc(irq, new); return 0; mismatch: if (!(new->flags & IRQF_PROBE_SHARED)) { pr_err("Flags mismatch irq %d. %08x (%s) vs. 
%08x (%s)\n", irq, new->flags, new->name, old->flags, old->name); #ifdef CONFIG_DEBUG_SHIRQ dump_stack(); #endif } ret = -EBUSY; out_unlock: raw_spin_unlock_irqrestore(&desc->lock, flags); if (!desc->action) irq_release_resources(desc); out_bus_unlock: chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); out_thread: if (new->thread) { struct task_struct *t = new->thread; new->thread = NULL; kthread_stop_put(t); } if (new->secondary && new->secondary->thread) { struct task_struct *t = new->secondary->thread; new->secondary->thread = NULL; kthread_stop_put(t); } out_mput: module_put(desc->owner); return ret; } /* * Internal function to unregister an irqaction - used to free * regular and special interrupts that are part of the architecture. */ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) { unsigned irq = desc->irq_data.irq; struct irqaction *action, **action_ptr; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); mutex_lock(&desc->request_mutex); chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); /* * There can be multiple actions per IRQ descriptor, find the right * one based on the dev_id: */ action_ptr = &desc->action; for (;;) { action = *action_ptr; if (!action) { WARN(1, "Trying to free already-free IRQ %d\n", irq); raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); return NULL; } if (action->dev_id == dev_id) break; action_ptr = &action->next; } /* Found it - now remove it from the list of entries: */ *action_ptr = action->next; irq_pm_remove_action(desc, action); /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { irq_settings_clr_disable_unlazy(desc); /* Only shutdown. Deactivate after synchronize_hardirq() */ irq_shutdown(desc); } #ifdef CONFIG_SMP /* make sure affinity_hint is cleaned up */ if (WARN_ON_ONCE(desc->affinity_hint)) desc->affinity_hint = NULL; #endif raw_spin_unlock_irqrestore(&desc->lock, flags); /* * Drop bus_lock here so the changes which were done in the chip * callbacks above are synced out to the irq chips which hang * behind a slow bus (I2C, SPI) before calling synchronize_hardirq(). * * Aside of that the bus_lock can also be taken from the threaded * handler in irq_finalize_oneshot() which results in a deadlock * because kthread_stop() would wait forever for the thread to * complete, which is blocked on the bus lock. * * The still held desc->request_mutex() protects against a * concurrent request_irq() of this irq so the release of resources * and timing data is properly serialized. */ chip_bus_sync_unlock(desc); unregister_handler_proc(irq, action); /* * Make sure it's not being used on another CPU and if the chip * supports it also make sure that there is no (not yet serviced) * interrupt in flight at the hardware level. */ __synchronize_irq(desc); #ifdef CONFIG_DEBUG_SHIRQ /* * It's a shared IRQ -- the driver ought to be prepared for an IRQ * event to happen even now it's being freed, so let's make sure that * is so by doing an extra call to the handler .... * * ( We do this after actually deregistering it, to make sure that a * 'real' IRQ doesn't run in parallel with our fake. ) */ if (action->flags & IRQF_SHARED) { local_irq_save(flags); action->handler(irq, dev_id); local_irq_restore(flags); } #endif /* * The action has already been removed above, but the thread writes * its oneshot mask bit when it completes. 
Though request_mutex is * held across this which prevents __setup_irq() from handing out * the same bit to a newly requested action. */ if (action->thread) { kthread_stop_put(action->thread); if (action->secondary && action->secondary->thread) kthread_stop_put(action->secondary->thread); } /* Last action releases resources */ if (!desc->action) { /* * Reacquire bus lock as irq_release_resources() might * require it to deallocate resources over the slow bus. */ chip_bus_lock(desc); /* * There is no interrupt on the fly anymore. Deactivate it * completely. */ scoped_guard(raw_spinlock_irqsave, &desc->lock) irq_domain_deactivate_irq(&desc->irq_data); irq_release_resources(desc); chip_bus_sync_unlock(desc); irq_remove_timings(desc); } mutex_unlock(&desc->request_mutex); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); kfree(action->secondary); return action; } /** * free_irq - free an interrupt allocated with request_irq * @irq: Interrupt line to free * @dev_id: Device identity to free * * Remove an interrupt handler. The handler is removed and if the interrupt * line is no longer in use by any driver it is disabled. On a shared IRQ * the caller must ensure the interrupt is disabled on the card it drives * before calling this function. The function does not return until any * executing interrupts for this IRQ have completed. * * This function must not be called from interrupt context. * * Returns the devname argument passed to request_irq. */ const void *free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; const char *devname; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return NULL; #ifdef CONFIG_SMP if (WARN_ON(desc->affinity_notify)) desc->affinity_notify = NULL; #endif action = __free_irq(desc, dev_id); if (!action) return NULL; devname = action->name; kfree(action); return devname; } EXPORT_SYMBOL(free_irq); /* This function must be called with desc->lock held */ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc) { const char *devname = NULL; desc->istate &= ~IRQS_NMI; if (!WARN_ON(desc->action == NULL)) { irq_pm_remove_action(desc, desc->action); devname = desc->action->name; unregister_handler_proc(irq, desc->action); kfree(desc->action); desc->action = NULL; } irq_settings_clr_disable_unlazy(desc); irq_shutdown_and_deactivate(desc); irq_release_resources(desc); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return devname; } const void *free_nmi(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || WARN_ON(!irq_is_nmi(desc))) return NULL; if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) return NULL; /* NMI still enabled */ if (WARN_ON(desc->depth == 0)) disable_nmi_nosync(irq); guard(raw_spinlock_irqsave)(&desc->lock); irq_nmi_teardown(desc); return __cleanup_nmi(irq, desc); } /** * request_threaded_irq - allocate an interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Primary handler for threaded interrupts. * If handler is NULL and thread_fn != NULL * the default primary handler is installed. * @thread_fn: Function called from the irq handler thread * If NULL, no irq thread is created * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the interrupt line * and IRQ handling. 
From the point this call is made your handler function * may be invoked. Since your handler function must clear any interrupt the * board raises, you must take care both to initialise your hardware and to * set up the interrupt handler in the right order. * * If you want to set up a threaded irq handler for your device then you * need to supply @handler and @thread_fn. @handler is still called in hard * interrupt context and has to check whether the interrupt originates from * the device. If yes it needs to disable the interrupt on the device and * return IRQ_WAKE_THREAD which will wake up the handler thread and run * @thread_fn. This split handler design is necessary to support shared * interrupts. * * @dev_id must be globally unique. Normally the address of the device data * structure is used as the cookie. Since the handler receives this value * it makes sense to use it. * * If your interrupt is shared you must pass a non NULL dev_id as this is * required when freeing the interrupt. * * Flags: * * IRQF_SHARED Interrupt is shared * IRQF_TRIGGER_* Specify active edge(s) or level * IRQF_ONESHOT Run thread_fn with interrupt line masked */ int request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long irqflags, const char *devname, void *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out * which interrupt is which (messes up the interrupt freeing * logic etc). * * Also shared interrupts do not go well with disabling auto enable. * The sharing interrupt might request it while it's still disabled * and then wait for interrupts forever. * * Also IRQF_COND_SUSPEND only makes sense for shared interrupts and * it cannot be set along with IRQF_NO_SUSPEND. */ if (((irqflags & IRQF_SHARED) && !dev_id) || ((irqflags & IRQF_SHARED) && (irqflags & IRQF_NO_AUTOEN)) || (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) || ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND))) return -EINVAL; desc = irq_to_desc(irq); if (!desc) return -EINVAL; if (!irq_settings_can_request(desc) || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return -EINVAL; if (!handler) { if (!thread_fn) return -EINVAL; handler = irq_default_primary_handler; } action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->thread_fn = thread_fn; action->flags = irqflags; action->name = devname; action->dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) { kfree(action); return retval; } retval = __setup_irq(irq, desc, action); if (retval) { irq_chip_pm_put(&desc->irq_data); kfree(action->secondary); kfree(action); } #ifdef CONFIG_DEBUG_SHIRQ_FIXME if (!retval && (irqflags & IRQF_SHARED)) { /* * It's a shared IRQ -- the driver ought to be prepared for it * to happen immediately, so let's make sure.... * We disable the irq to make sure that a 'real' IRQ doesn't * run in parallel with our fake. */ unsigned long flags; disable_irq(irq); local_irq_save(flags); handler(irq, dev_id); local_irq_restore(flags); enable_irq(irq); } #endif return retval; } EXPORT_SYMBOL(request_threaded_irq); /** * request_any_context_irq - allocate an interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Threaded handler for threaded interrupts. 
* @flags: Interrupt type flags * @name: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the interrupt line * and IRQ handling. It selects either a hardirq or threaded handling * method depending on the context. * * Returns: On failure, it returns a negative value. On success, it returns either * IRQC_IS_HARDIRQ or IRQC_IS_NESTED. */ int request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id) { struct irq_desc *desc; int ret; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; desc = irq_to_desc(irq); if (!desc) return -EINVAL; if (irq_settings_is_nested_thread(desc)) { ret = request_threaded_irq(irq, NULL, handler, flags, name, dev_id); return !ret ? IRQC_IS_NESTED : ret; } ret = request_irq(irq, handler, flags, name, dev_id); return !ret ? IRQC_IS_HARDIRQ : ret; } EXPORT_SYMBOL_GPL(request_any_context_irq); /** * request_nmi - allocate an interrupt line for NMI delivery * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Threaded handler for threaded interrupts. * @irqflags: Interrupt type flags * @name: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the interrupt line * and IRQ handling. It sets up the IRQ line to be handled as an NMI. * * An interrupt line delivering NMIs cannot be shared and IRQ handling * cannot be threaded. * * Interrupt lines requested for NMI delivering must produce per cpu * interrupts and have auto enabling setting disabled. * * @dev_id must be globally unique. Normally the address of the device data * structure is used as the cookie. Since the handler receives this value * it makes sense to use it. * * If the interrupt line cannot be used to deliver NMIs, function will fail * and return a negative value. */ int request_nmi(unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *name, void *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; /* NMI cannot be shared, used for Polling */ if (irqflags & (IRQF_SHARED | IRQF_COND_SUSPEND | IRQF_IRQPOLL)) return -EINVAL; if (!(irqflags & IRQF_PERCPU)) return -EINVAL; if (!handler) return -EINVAL; desc = irq_to_desc(irq); if (!desc || (irq_settings_can_autoenable(desc) && !(irqflags & IRQF_NO_AUTOEN)) || !irq_settings_can_request(desc) || WARN_ON(irq_settings_is_per_cpu_devid(desc)) || !irq_supports_nmi(desc)) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = irqflags | IRQF_NO_THREAD | IRQF_NOBALANCING; action->name = name; action->dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) goto err_out; retval = __setup_irq(irq, desc, action); if (retval) goto err_irq_setup; scoped_guard(raw_spinlock_irqsave, &desc->lock) { /* Setup NMI state */ desc->istate |= IRQS_NMI; retval = irq_nmi_setup(desc); if (retval) { __cleanup_nmi(irq, desc); return -EINVAL; } return 0; } err_irq_setup: irq_chip_pm_put(&desc->irq_data); err_out: kfree(action); return retval; } void enable_percpu_irq(unsigned int irq, unsigned int type) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { struct irq_desc *desc = scoped_irqdesc; /* * If the trigger type is not specified by the caller, then * use the default for this interrupt. 
*/ type &= IRQ_TYPE_SENSE_MASK; if (type == IRQ_TYPE_NONE) type = irqd_get_trigger_type(&desc->irq_data); if (type != IRQ_TYPE_NONE) { if (__irq_set_trigger(desc, type)) { WARN(1, "failed to set type for IRQ%d\n", irq); return; } } irq_percpu_enable(desc, smp_processor_id()); } } EXPORT_SYMBOL_GPL(enable_percpu_irq); void enable_percpu_nmi(unsigned int irq, unsigned int type) { enable_percpu_irq(irq, type); } /** * irq_percpu_is_enabled - Check whether the per cpu irq is enabled * @irq: Linux irq number to check for * * Must be called from a non migratable context. Returns the enable * state of a per cpu interrupt on the current cpu. */ bool irq_percpu_is_enabled(unsigned int irq) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) return cpumask_test_cpu(smp_processor_id(), scoped_irqdesc->percpu_enabled); return false; } EXPORT_SYMBOL_GPL(irq_percpu_is_enabled); void disable_percpu_irq(unsigned int irq) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) irq_percpu_disable(scoped_irqdesc, smp_processor_id()); } EXPORT_SYMBOL_GPL(disable_percpu_irq); void disable_percpu_nmi(unsigned int irq) { disable_percpu_irq(irq); } /* * Internal function to unregister a percpu irqaction. */ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); if (!desc) return NULL; scoped_guard(raw_spinlock_irqsave, &desc->lock) { action = desc->action; if (!action || action->percpu_dev_id != dev_id) { WARN(1, "Trying to free already-free IRQ %d\n", irq); return NULL; } if (!cpumask_empty(desc->percpu_enabled)) { WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", irq, cpumask_first(desc->percpu_enabled)); return NULL; } /* Found it - now remove it from the list of entries: */ desc->action = NULL; desc->istate &= ~IRQS_NMI; } unregister_handler_proc(irq, action); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return action; } /** * free_percpu_irq - free an interrupt allocated with request_percpu_irq * @irq: Interrupt line to free * @dev_id: Device identity to free * * Remove a percpu interrupt handler. The handler is removed, but the * interrupt line is not disabled. This must be done on each CPU before * calling this function. The function does not return until any executing * interrupts for this IRQ have completed. * * This function must not be called from interrupt context. */ void free_percpu_irq(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !irq_settings_is_per_cpu_devid(desc)) return; chip_bus_lock(desc); kfree(__free_percpu_irq(irq, dev_id)); chip_bus_sync_unlock(desc); } EXPORT_SYMBOL_GPL(free_percpu_irq); void free_percpu_nmi(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !irq_settings_is_per_cpu_devid(desc)) return; if (WARN_ON(!irq_is_nmi(desc))) return; kfree(__free_percpu_irq(irq, dev_id)); } /** * setup_percpu_irq - setup a per-cpu interrupt * @irq: Interrupt line to setup * @act: irqaction for the interrupt * * Used to statically setup per-cpu interrupts in the early boot process. 
*/ int setup_percpu_irq(unsigned int irq, struct irqaction *act) { struct irq_desc *desc = irq_to_desc(irq); int retval; if (!desc || !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) return retval; retval = __setup_irq(irq, desc, act); if (retval) irq_chip_pm_put(&desc->irq_data); return retval; } /** * __request_percpu_irq - allocate a percpu interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @flags: Interrupt type flags (IRQF_TIMER only) * @devname: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * * This call allocates interrupt resources and enables the interrupt on the * local CPU. If the interrupt is supposed to be enabled on other CPUs, it * has to be done on each CPU using enable_percpu_irq(). * * @dev_id must be globally unique. It is a per-cpu variable, and * the handler gets called with the interrupted CPU's instance of * that variable. */ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (!dev_id) return -EINVAL; desc = irq_to_desc(irq); if (!desc || !irq_settings_can_request(desc) || !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; if (flags && flags != IRQF_TIMER) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; action->name = devname; action->percpu_dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) { kfree(action); return retval; } retval = __setup_irq(irq, desc, action); if (retval) { irq_chip_pm_put(&desc->irq_data); kfree(action); } return retval; } EXPORT_SYMBOL_GPL(__request_percpu_irq); /** * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @name: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs * have to be setup on each CPU by calling prepare_percpu_nmi() before * being enabled on the same CPU by using enable_percpu_nmi(). * * @dev_id must be globally unique. It is a per-cpu variable, and the * handler gets called with the interrupted CPU's instance of that * variable. * * Interrupt lines requested for NMI delivering should have auto enabling * setting disabled. * * If the interrupt line cannot be used to deliver NMIs, function * will fail returning a negative value. 
*/ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (!handler) return -EINVAL; desc = irq_to_desc(irq); if (!desc || !irq_settings_can_request(desc) || !irq_settings_is_per_cpu_devid(desc) || irq_settings_can_autoenable(desc) || !irq_supports_nmi(desc)) return -EINVAL; /* The line cannot already be NMI */ if (irq_is_nmi(desc)) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD | IRQF_NOBALANCING; action->name = name; action->percpu_dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) goto err_out; retval = __setup_irq(irq, desc, action); if (retval) goto err_irq_setup; scoped_guard(raw_spinlock_irqsave, &desc->lock) desc->istate |= IRQS_NMI; return 0; err_irq_setup: irq_chip_pm_put(&desc->irq_data); err_out: kfree(action); return retval; } /** * prepare_percpu_nmi - performs CPU local setup for NMI delivery * @irq: Interrupt line to prepare for NMI delivery * * This call prepares an interrupt line to deliver NMI on the current CPU, * before that interrupt line gets enabled with enable_percpu_nmi(). * * As a CPU local operation, this should be called from non-preemptible * context. * * If the interrupt line cannot be used to deliver NMIs, function will fail * returning a negative value. */ int prepare_percpu_nmi(unsigned int irq) { int ret = -EINVAL; WARN_ON(preemptible()); scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { if (WARN(!irq_is_nmi(scoped_irqdesc), "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", irq)) return -EINVAL; ret = irq_nmi_setup(scoped_irqdesc); if (ret) pr_err("Failed to setup NMI delivery: irq %u\n", irq); } return ret; } /** * teardown_percpu_nmi - undoes NMI setup of IRQ line * @irq: Interrupt line from which CPU local NMI configuration should be removed * * This call undoes the setup done by prepare_percpu_nmi(). * * IRQ line should not be enabled for the current CPU. * As a CPU local operation, this should be called from non-preemptible * context. */ void teardown_percpu_nmi(unsigned int irq) { WARN_ON(preemptible()); scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { if (WARN_ON(!irq_is_nmi(scoped_irqdesc))) return; irq_nmi_teardown(scoped_irqdesc); } } static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) { struct irq_chip *chip; int err = -EINVAL; do { chip = irq_data_get_irq_chip(data); if (WARN_ON_ONCE(!chip)) return -ENODEV; if (chip->irq_get_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY data = data->parent_data; #else data = NULL; #endif } while (data); if (data) err = chip->irq_get_irqchip_state(data, which, state); return err; } /** * irq_get_irqchip_state - returns the irqchip state of a interrupt. * @irq: Interrupt line that is forwarded to a VM * @which: One of IRQCHIP_STATE_* the caller wants to know about * @state: a pointer to a boolean where the state is to be stored * * This call snapshots the internal irqchip state of an interrupt, * returning into @state the bit corresponding to stage @which * * This function should be called with preemption disabled if the interrupt * controller has per-cpu registers. 
*/ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state) { scoped_irqdesc_get_and_buslock(irq, 0) { struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc); return __irq_get_irqchip_state(data, which, state); } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_get_irqchip_state); /** * irq_set_irqchip_state - set the state of a forwarded interrupt. * @irq: Interrupt line that is forwarded to a VM * @which: State to be restored (one of IRQCHIP_STATE_*) * @val: Value corresponding to @which * * This call sets the internal irqchip state of an interrupt, depending on * the value of @which. * * This function should be called with migration disabled if the interrupt * controller has per-cpu registers. */ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool val) { scoped_irqdesc_get_and_buslock(irq, 0) { struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc); struct irq_chip *chip; do { chip = irq_data_get_irq_chip(data); if (WARN_ON_ONCE(!chip)) return -ENODEV; if (chip->irq_set_irqchip_state) break; data = irqd_get_parent_data(data); } while (data); if (data) return chip->irq_set_irqchip_state(data, which, val); } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_irqchip_state); /** * irq_has_action - Check whether an interrupt is requested * @irq: The linux irq number * * Returns: A snapshot of the current state */ bool irq_has_action(unsigned int irq) { bool res; rcu_read_lock(); res = irq_desc_has_action(irq_to_desc(irq)); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(irq_has_action); /** * irq_check_status_bit - Check whether bits in the irq descriptor status are set * @irq: The linux irq number * @bitmask: The bitmask to evaluate * * Returns: True if one of the bits in @bitmask is set */ bool irq_check_status_bit(unsigned int irq, unsigned int bitmask) { struct irq_desc *desc; bool res = false; rcu_read_lock(); desc = irq_to_desc(irq); if (desc) res = !!(desc->status_use_accessors & bitmask); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(irq_check_status_bit);
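The per-CPU request/enable split documented above means a driver requests the line once and then enables it on every CPU that should receive it. The following sketch illustrates that contract and is not taken from the kernel sources: my_pcpu_state, my_percpu_handler, my_percpu_irq_init, the "my-percpu-dev" name and the IRQ number are all hypothetical; request_percpu_irq() is the flags==0 wrapper around __request_percpu_irq() from <linux/interrupt.h>.

#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/percpu.h>
#include <linux/smp.h>

/* Hypothetical per-CPU cookie; the handler receives this CPU's instance. */
struct my_pcpu_state {
	unsigned long count;
};

static struct my_pcpu_state __percpu *my_state;

static irqreturn_t my_percpu_handler(int irq, void *dev_id)
{
	struct my_pcpu_state *st = dev_id;

	st->count++;
	return IRQ_HANDLED;
}

static void my_enable_on_cpu(void *info)
{
	enable_percpu_irq(*(unsigned int *)info, IRQ_TYPE_NONE);
}

static int my_percpu_irq_init(unsigned int my_irq)
{
	int ret;

	my_state = alloc_percpu(struct my_pcpu_state);
	if (!my_state)
		return -ENOMEM;

	/* Allocates the line; per the kernel-doc above this enables it on
	 * the local CPU only. */
	ret = request_percpu_irq(my_irq, my_percpu_handler,
				 "my-percpu-dev", my_state);
	if (ret) {
		free_percpu(my_state);
		return ret;
	}

	/* The other CPUs have to enable the line themselves. */
	on_each_cpu(my_enable_on_cpu, &my_irq, 1);
	return 0;
}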
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIMERQUEUE_H
#define _LINUX_TIMERQUEUE_H

#include <linux/rbtree.h>
#include <linux/timerqueue_types.h>

extern bool timerqueue_add(struct timerqueue_head *head,
			   struct timerqueue_node *node);
extern bool timerqueue_del(struct timerqueue_head *head,
			   struct timerqueue_node *node);
extern struct timerqueue_node *timerqueue_iterate_next(
						struct timerqueue_node *node);

/**
 * timerqueue_getnext - Returns the timer with the earliest expiration time
 *
 * @head: head of timerqueue
 *
 * Returns a pointer to the timer node that has the earliest expiration time.
 */
static inline
struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head)
{
	struct rb_node *leftmost = rb_first_cached(&head->rb_root);

	return rb_entry_safe(leftmost, struct timerqueue_node, node);
}

static inline void timerqueue_init(struct timerqueue_node *node)
{
	RB_CLEAR_NODE(&node->node);
}

static inline bool timerqueue_node_queued(struct timerqueue_node *node)
{
	return !RB_EMPTY_NODE(&node->node);
}

static inline void timerqueue_init_head(struct timerqueue_head *head)
{
	head->rb_root = RB_ROOT_CACHED;
}
#endif /* _LINUX_TIMERQUEUE_H */
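To show how the timerqueue API above is meant to be used, here is a minimal sketch of a caller that keeps its own objects ordered by deadline. Everything named my_* is an assumption for illustration; the node's expires field is defined in <linux/timerqueue_types.h>.

#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/timerqueue.h>

/* Hypothetical wrapper object ordered by its deadline. */
struct my_timer {
	struct timerqueue_node node;	/* node.expires holds the deadline */
	void (*fn)(struct my_timer *);
};

static struct timerqueue_head my_queue;

static void my_queue_setup(void)
{
	timerqueue_init_head(&my_queue);
}

static void my_timer_enqueue(struct my_timer *t, ktime_t expires)
{
	timerqueue_init(&t->node);
	t->node.expires = expires;
	if (timerqueue_add(&my_queue, &t->node)) {
		/* @t is now the earliest deadline: reprogram hardware here. */
	}
}

/* Peek at the earliest-expiring entry, if any. */
static struct my_timer *my_timer_peek(void)
{
	struct timerqueue_node *next = timerqueue_getnext(&my_queue);

	return next ? container_of(next, struct my_timer, node) : NULL;
}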
829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright 2007 Hewlett-Packard Development Company, L.P. * * This file is part of the SCTP kernel implementation * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Vlad Yasevich <vladislav.yasevich@hp.com> */ #include <crypto/sha1.h> #include <crypto/sha2.h> #include <linux/slab.h> #include <linux/types.h> #include <net/sctp/sctp.h> #include <net/sctp/auth.h> static const struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { { /* id 0 is reserved. as all 0 */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_0, }, { .hmac_id = SCTP_AUTH_HMAC_ID_SHA1, .hmac_len = SHA1_DIGEST_SIZE, }, { /* id 2 is reserved as well */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2, }, { .hmac_id = SCTP_AUTH_HMAC_ID_SHA256, .hmac_len = SHA256_DIGEST_SIZE, } }; static bool sctp_hmac_supported(__u16 hmac_id) { return hmac_id < ARRAY_SIZE(sctp_hmac_list) && sctp_hmac_list[hmac_id].hmac_len != 0; } void sctp_auth_key_put(struct sctp_auth_bytes *key) { if (!key) return; if (refcount_dec_and_test(&key->refcnt)) { kfree_sensitive(key); SCTP_DBG_OBJCNT_DEC(keys); } } /* Create a new key structure of a given length */ static struct sctp_auth_bytes *sctp_auth_create_key(__u32 key_len, gfp_t gfp) { struct sctp_auth_bytes *key; /* Verify that we are not going to overflow INT_MAX */ if (key_len > (INT_MAX - sizeof(struct sctp_auth_bytes))) return NULL; /* Allocate the shared key */ key = kmalloc(sizeof(struct sctp_auth_bytes) + key_len, gfp); if (!key) return NULL; key->len = key_len; refcount_set(&key->refcnt, 1); SCTP_DBG_OBJCNT_INC(keys); return key; } /* Create a new shared key container with a give key id */ struct sctp_shared_key *sctp_auth_shkey_create(__u16 key_id, gfp_t gfp) { struct sctp_shared_key *new; /* Allocate the shared key container */ new = kzalloc(sizeof(struct sctp_shared_key), gfp); if (!new) return NULL; INIT_LIST_HEAD(&new->key_list); refcount_set(&new->refcnt, 1); new->key_id = key_id; return new; } /* Free the shared key structure */ static void sctp_auth_shkey_destroy(struct sctp_shared_key *sh_key) { BUG_ON(!list_empty(&sh_key->key_list)); sctp_auth_key_put(sh_key->key); sh_key->key = NULL; kfree(sh_key); } void sctp_auth_shkey_release(struct sctp_shared_key *sh_key) { if (refcount_dec_and_test(&sh_key->refcnt)) sctp_auth_shkey_destroy(sh_key); } void sctp_auth_shkey_hold(struct sctp_shared_key *sh_key) { refcount_inc(&sh_key->refcnt); } /* Destroy the entire key list. This is done during the * associon and endpoint free process. */ void sctp_auth_destroy_keys(struct list_head *keys) { struct sctp_shared_key *ep_key; struct sctp_shared_key *tmp; if (list_empty(keys)) return; key_for_each_safe(ep_key, tmp, keys) { list_del_init(&ep_key->key_list); sctp_auth_shkey_release(ep_key); } } /* Compare two byte vectors as numbers. 
Return values * are: * 0 - vectors are equal * < 0 - vector 1 is smaller than vector2 * > 0 - vector 1 is greater than vector2 * * Algorithm is: * This is performed by selecting the numerically smaller key vector... * If the key vectors are equal as numbers but differ in length ... * the shorter vector is considered smaller * * Examples (with small values): * 000123456789 > 123456789 (first number is longer) * 000123456789 < 234567891 (second number is larger numerically) * 123456789 > 2345678 (first number is both larger & longer) */ static int sctp_auth_compare_vectors(struct sctp_auth_bytes *vector1, struct sctp_auth_bytes *vector2) { int diff; int i; const __u8 *longer; diff = vector1->len - vector2->len; if (diff) { longer = (diff > 0) ? vector1->data : vector2->data; /* Check to see if the longer number is * lead-zero padded. If it is not, it * is automatically larger numerically. */ for (i = 0; i < abs(diff); i++) { if (longer[i] != 0) return diff; } } /* lengths are the same, compare numbers */ return memcmp(vector1->data, vector2->data, vector1->len); } /* * Create a key vector as described in SCTP-AUTH, Section 6.1 * The RANDOM parameter, the CHUNKS parameter and the HMAC-ALGO * parameter sent by each endpoint are concatenated as byte vectors. * These parameters include the parameter type, parameter length, and * the parameter value, but padding is omitted; all padding MUST be * removed from this concatenation before proceeding with further * computation of keys. Parameters which were not sent are simply * omitted from the concatenation process. The resulting two vectors * are called the two key vectors. */ static struct sctp_auth_bytes *sctp_auth_make_key_vector( struct sctp_random_param *random, struct sctp_chunks_param *chunks, struct sctp_hmac_algo_param *hmacs, gfp_t gfp) { struct sctp_auth_bytes *new; __u32 len; __u32 offset = 0; __u16 random_len, hmacs_len, chunks_len = 0; random_len = ntohs(random->param_hdr.length); hmacs_len = ntohs(hmacs->param_hdr.length); if (chunks) chunks_len = ntohs(chunks->param_hdr.length); len = random_len + hmacs_len + chunks_len; new = sctp_auth_create_key(len, gfp); if (!new) return NULL; memcpy(new->data, random, random_len); offset += random_len; if (chunks) { memcpy(new->data + offset, chunks, chunks_len); offset += chunks_len; } memcpy(new->data + offset, hmacs, hmacs_len); return new; } /* Make a key vector based on our local parameters */ static struct sctp_auth_bytes *sctp_auth_make_local_vector( const struct sctp_association *asoc, gfp_t gfp) { return sctp_auth_make_key_vector( (struct sctp_random_param *)asoc->c.auth_random, (struct sctp_chunks_param *)asoc->c.auth_chunks, (struct sctp_hmac_algo_param *)asoc->c.auth_hmacs, gfp); } /* Make a key vector based on peer's parameters */ static struct sctp_auth_bytes *sctp_auth_make_peer_vector( const struct sctp_association *asoc, gfp_t gfp) { return sctp_auth_make_key_vector(asoc->peer.peer_random, asoc->peer.peer_chunks, asoc->peer.peer_hmacs, gfp); } /* Set the value of the association shared key base on the parameters * given. The algorithm is: * From the endpoint pair shared keys and the key vectors the * association shared keys are computed. This is performed by selecting * the numerically smaller key vector and concatenating it to the * endpoint pair shared key, and then concatenating the numerically * larger key vector to that. The result of the concatenation is the * association shared key. 
*/ static struct sctp_auth_bytes *sctp_auth_asoc_set_secret( struct sctp_shared_key *ep_key, struct sctp_auth_bytes *first_vector, struct sctp_auth_bytes *last_vector, gfp_t gfp) { struct sctp_auth_bytes *secret; __u32 offset = 0; __u32 auth_len; auth_len = first_vector->len + last_vector->len; if (ep_key->key) auth_len += ep_key->key->len; secret = sctp_auth_create_key(auth_len, gfp); if (!secret) return NULL; if (ep_key->key) { memcpy(secret->data, ep_key->key->data, ep_key->key->len); offset += ep_key->key->len; } memcpy(secret->data + offset, first_vector->data, first_vector->len); offset += first_vector->len; memcpy(secret->data + offset, last_vector->data, last_vector->len); return secret; } /* Create an association shared key. Follow the algorithm * described in SCTP-AUTH, Section 6.1 */ static struct sctp_auth_bytes *sctp_auth_asoc_create_secret( const struct sctp_association *asoc, struct sctp_shared_key *ep_key, gfp_t gfp) { struct sctp_auth_bytes *local_key_vector; struct sctp_auth_bytes *peer_key_vector; struct sctp_auth_bytes *first_vector, *last_vector; struct sctp_auth_bytes *secret = NULL; int cmp; /* Now we need to build the key vectors * SCTP-AUTH , Section 6.1 * The RANDOM parameter, the CHUNKS parameter and the HMAC-ALGO * parameter sent by each endpoint are concatenated as byte vectors. * These parameters include the parameter type, parameter length, and * the parameter value, but padding is omitted; all padding MUST be * removed from this concatenation before proceeding with further * computation of keys. Parameters which were not sent are simply * omitted from the concatenation process. The resulting two vectors * are called the two key vectors. */ local_key_vector = sctp_auth_make_local_vector(asoc, gfp); peer_key_vector = sctp_auth_make_peer_vector(asoc, gfp); if (!peer_key_vector || !local_key_vector) goto out; /* Figure out the order in which the key_vectors will be * added to the endpoint shared key. * SCTP-AUTH, Section 6.1: * This is performed by selecting the numerically smaller key * vector and concatenating it to the endpoint pair shared * key, and then concatenating the numerically larger key * vector to that. If the key vectors are equal as numbers * but differ in length, then the concatenation order is the * endpoint shared key, followed by the shorter key vector, * followed by the longer key vector. Otherwise, the key * vectors are identical, and may be concatenated to the * endpoint pair key in any order. */ cmp = sctp_auth_compare_vectors(local_key_vector, peer_key_vector); if (cmp < 0) { first_vector = local_key_vector; last_vector = peer_key_vector; } else { first_vector = peer_key_vector; last_vector = local_key_vector; } secret = sctp_auth_asoc_set_secret(ep_key, first_vector, last_vector, gfp); out: sctp_auth_key_put(local_key_vector); sctp_auth_key_put(peer_key_vector); return secret; } /* * Populate the association overlay list with the list * from the endpoint. 
*/ int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep, struct sctp_association *asoc, gfp_t gfp) { struct sctp_shared_key *sh_key; struct sctp_shared_key *new; BUG_ON(!list_empty(&asoc->endpoint_shared_keys)); key_for_each(sh_key, &ep->endpoint_shared_keys) { new = sctp_auth_shkey_create(sh_key->key_id, gfp); if (!new) goto nomem; new->key = sh_key->key; sctp_auth_key_hold(new->key); list_add(&new->key_list, &asoc->endpoint_shared_keys); } return 0; nomem: sctp_auth_destroy_keys(&asoc->endpoint_shared_keys); return -ENOMEM; } /* Public interface to create the association shared key. * See code above for the algorithm. */ int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp) { struct sctp_auth_bytes *secret; struct sctp_shared_key *ep_key; struct sctp_chunk *chunk; /* If we don't support AUTH, or peer is not capable * we don't need to do anything. */ if (!asoc->peer.auth_capable) return 0; /* If the key_id is non-zero and we couldn't find an * endpoint pair shared key, we can't compute the * secret. * For key_id 0, endpoint pair shared key is a NULL key. */ ep_key = sctp_auth_get_shkey(asoc, asoc->active_key_id); BUG_ON(!ep_key); secret = sctp_auth_asoc_create_secret(asoc, ep_key, gfp); if (!secret) return -ENOMEM; sctp_auth_key_put(asoc->asoc_shared_key); asoc->asoc_shared_key = secret; asoc->shkey = ep_key; /* Update send queue in case any chunk already in there now * needs authenticating */ list_for_each_entry(chunk, &asoc->outqueue.out_chunk_list, list) { if (sctp_auth_send_cid(chunk->chunk_hdr->type, asoc)) { chunk->auth = 1; if (!chunk->shkey) { chunk->shkey = asoc->shkey; sctp_auth_shkey_hold(chunk->shkey); } } } return 0; } /* Find the endpoint pair shared key based on the key_id */ struct sctp_shared_key *sctp_auth_get_shkey( const struct sctp_association *asoc, __u16 key_id) { struct sctp_shared_key *key; /* First search associations set of endpoint pair shared keys */ key_for_each(key, &asoc->endpoint_shared_keys) { if (key->key_id == key_id) { if (!key->deactivated) return key; break; } } return NULL; } const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) { return &sctp_hmac_list[hmac_id]; } /* Get an hmac description information that we can use to build * the AUTH chunk */ const struct sctp_hmac * sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) { struct sctp_hmac_algo_param *hmacs; __u16 n_elt; __u16 id = 0; int i; /* If we have a default entry, use it */ if (asoc->default_hmac_id) return &sctp_hmac_list[asoc->default_hmac_id]; /* Since we do not have a default entry, find the first entry * we support and return that. Do not cache that id. 
*/ hmacs = asoc->peer.peer_hmacs; if (!hmacs) return NULL; n_elt = (ntohs(hmacs->param_hdr.length) - sizeof(struct sctp_paramhdr)) >> 1; for (i = 0; i < n_elt; i++) { id = ntohs(hmacs->hmac_ids[i]); if (sctp_hmac_supported(id)) return &sctp_hmac_list[id]; } return NULL; } static int __sctp_auth_find_hmacid(__be16 *hmacs, int n_elts, __be16 hmac_id) { int found = 0; int i; for (i = 0; i < n_elts; i++) { if (hmac_id == hmacs[i]) { found = 1; break; } } return found; } /* See if the HMAC_ID is one that we claim as supported */ int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc, __be16 hmac_id) { struct sctp_hmac_algo_param *hmacs; __u16 n_elt; if (!asoc) return 0; hmacs = (struct sctp_hmac_algo_param *)asoc->c.auth_hmacs; n_elt = (ntohs(hmacs->param_hdr.length) - sizeof(struct sctp_paramhdr)) >> 1; return __sctp_auth_find_hmacid(hmacs->hmac_ids, n_elt, hmac_id); } /* Cache the default HMAC id. This to follow this text from SCTP-AUTH: * Section 6.1: * The receiver of a HMAC-ALGO parameter SHOULD use the first listed * algorithm it supports. */ void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, struct sctp_hmac_algo_param *hmacs) { __u16 id; int i; int n_params; /* if the default id is already set, use it */ if (asoc->default_hmac_id) return; n_params = (ntohs(hmacs->param_hdr.length) - sizeof(struct sctp_paramhdr)) >> 1; for (i = 0; i < n_params; i++) { id = ntohs(hmacs->hmac_ids[i]); if (sctp_hmac_supported(id)) { asoc->default_hmac_id = id; break; } } } /* Check to see if the given chunk is supposed to be authenticated */ static int __sctp_auth_cid(enum sctp_cid chunk, struct sctp_chunks_param *param) { unsigned short len; int found = 0; int i; if (!param || param->param_hdr.length == 0) return 0; len = ntohs(param->param_hdr.length) - sizeof(struct sctp_paramhdr); /* SCTP-AUTH, Section 3.2 * The chunk types for INIT, INIT-ACK, SHUTDOWN-COMPLETE and AUTH * chunks MUST NOT be listed in the CHUNKS parameter. However, if * a CHUNKS parameter is received then the types for INIT, INIT-ACK, * SHUTDOWN-COMPLETE and AUTH chunks MUST be ignored. */ for (i = 0; !found && i < len; i++) { switch (param->chunks[i]) { case SCTP_CID_INIT: case SCTP_CID_INIT_ACK: case SCTP_CID_SHUTDOWN_COMPLETE: case SCTP_CID_AUTH: break; default: if (param->chunks[i] == chunk) found = 1; break; } } return found; } /* Check if peer requested that this chunk is authenticated */ int sctp_auth_send_cid(enum sctp_cid chunk, const struct sctp_association *asoc) { if (!asoc) return 0; if (!asoc->peer.auth_capable) return 0; return __sctp_auth_cid(chunk, asoc->peer.peer_chunks); } /* Check if we requested that peer authenticate this chunk. */ int sctp_auth_recv_cid(enum sctp_cid chunk, const struct sctp_association *asoc) { if (!asoc) return 0; if (!asoc->peer.auth_capable) return 0; return __sctp_auth_cid(chunk, (struct sctp_chunks_param *)asoc->c.auth_chunks); } /* SCTP-AUTH: Section 6.2: * The sender MUST calculate the MAC as described in RFC2104 [2] using * the hash function H as described by the MAC Identifier and the shared * association key K based on the endpoint pair shared key described by * the shared key identifier. The 'data' used for the computation of * the AUTH-chunk is given by the AUTH chunk with its HMAC field set to * zero (as shown in Figure 6) followed by all chunks that are placed * after the AUTH chunk in the SCTP packet. 
*/ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, struct sk_buff *skb, struct sctp_auth_chunk *auth, struct sctp_shared_key *ep_key, gfp_t gfp) { struct sctp_auth_bytes *asoc_key; __u16 key_id, hmac_id; int free_key = 0; size_t data_len; __u8 *digest; /* Extract the info we need: * - hmac id * - key id */ key_id = ntohs(auth->auth_hdr.shkey_id); hmac_id = ntohs(auth->auth_hdr.hmac_id); if (key_id == asoc->active_key_id) asoc_key = asoc->asoc_shared_key; else { /* ep_key can't be NULL here */ asoc_key = sctp_auth_asoc_create_secret(asoc, ep_key, gfp); if (!asoc_key) return; free_key = 1; } data_len = skb_tail_pointer(skb) - (unsigned char *)auth; digest = (u8 *)(&auth->auth_hdr + 1); if (hmac_id == SCTP_AUTH_HMAC_ID_SHA1) { hmac_sha1_usingrawkey(asoc_key->data, asoc_key->len, (const u8 *)auth, data_len, digest); } else { WARN_ON_ONCE(hmac_id != SCTP_AUTH_HMAC_ID_SHA256); hmac_sha256_usingrawkey(asoc_key->data, asoc_key->len, (const u8 *)auth, data_len, digest); } if (free_key) sctp_auth_key_put(asoc_key); } /* API Helpers */ /* Add a chunk to the endpoint authenticated chunk list */ int sctp_auth_ep_add_chunkid(struct sctp_endpoint *ep, __u8 chunk_id) { struct sctp_chunks_param *p = ep->auth_chunk_list; __u16 nchunks; __u16 param_len; /* If this chunk is already specified, we are done */ if (__sctp_auth_cid(chunk_id, p)) return 0; /* Check if we can add this chunk to the array */ param_len = ntohs(p->param_hdr.length); nchunks = param_len - sizeof(struct sctp_paramhdr); if (nchunks == SCTP_NUM_CHUNK_TYPES) return -EINVAL; p->chunks[nchunks] = chunk_id; p->param_hdr.length = htons(param_len + 1); return 0; } /* Add hmac identifires to the endpoint list of supported hmac ids */ int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep, struct sctp_hmacalgo *hmacs) { int has_sha1 = 0; __u16 id; int i; /* Scan the list looking for unsupported id. Also make sure that * SHA1 is specified. */ for (i = 0; i < hmacs->shmac_num_idents; i++) { id = hmacs->shmac_idents[i]; if (!sctp_hmac_supported(id)) return -EOPNOTSUPP; if (SCTP_AUTH_HMAC_ID_SHA1 == id) has_sha1 = 1; } if (!has_sha1) return -EINVAL; for (i = 0; i < hmacs->shmac_num_idents; i++) ep->auth_hmacs_list->hmac_ids[i] = htons(hmacs->shmac_idents[i]); ep->auth_hmacs_list->param_hdr.length = htons(sizeof(struct sctp_paramhdr) + hmacs->shmac_num_idents * sizeof(__u16)); return 0; } /* Set a new shared key on either endpoint or association. If the * key with a same ID already exists, replace the key (remove the * old key and add a new one). 
*/ int sctp_auth_set_key(struct sctp_endpoint *ep, struct sctp_association *asoc, struct sctp_authkey *auth_key) { struct sctp_shared_key *cur_key, *shkey; struct sctp_auth_bytes *key; struct list_head *sh_keys; int replace = 0; /* Try to find the given key id to see if * we are doing a replace, or adding a new key */ if (asoc) { if (!asoc->peer.auth_capable) return -EACCES; sh_keys = &asoc->endpoint_shared_keys; } else { if (!ep->auth_enable) return -EACCES; sh_keys = &ep->endpoint_shared_keys; } key_for_each(shkey, sh_keys) { if (shkey->key_id == auth_key->sca_keynumber) { replace = 1; break; } } cur_key = sctp_auth_shkey_create(auth_key->sca_keynumber, GFP_KERNEL); if (!cur_key) return -ENOMEM; /* Create a new key data based on the info passed in */ key = sctp_auth_create_key(auth_key->sca_keylength, GFP_KERNEL); if (!key) { kfree(cur_key); return -ENOMEM; } memcpy(key->data, &auth_key->sca_key[0], auth_key->sca_keylength); cur_key->key = key; if (!replace) { list_add(&cur_key->key_list, sh_keys); return 0; } list_del_init(&shkey->key_list); list_add(&cur_key->key_list, sh_keys); if (asoc && asoc->active_key_id == auth_key->sca_keynumber && sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL)) { list_del_init(&cur_key->key_list); sctp_auth_shkey_release(cur_key); list_add(&shkey->key_list, sh_keys); return -ENOMEM; } sctp_auth_shkey_release(shkey); return 0; } int sctp_auth_set_active_key(struct sctp_endpoint *ep, struct sctp_association *asoc, __u16 key_id) { struct sctp_shared_key *key; struct list_head *sh_keys; int found = 0; /* The key identifier MUST correst to an existing key */ if (asoc) { if (!asoc->peer.auth_capable) return -EACCES; sh_keys = &asoc->endpoint_shared_keys; } else { if (!ep->auth_enable) return -EACCES; sh_keys = &ep->endpoint_shared_keys; } key_for_each(key, sh_keys) { if (key->key_id == key_id) { found = 1; break; } } if (!found || key->deactivated) return -EINVAL; if (asoc) { __u16 active_key_id = asoc->active_key_id; asoc->active_key_id = key_id; if (sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL)) { asoc->active_key_id = active_key_id; return -ENOMEM; } } else ep->active_key_id = key_id; return 0; } int sctp_auth_del_key_id(struct sctp_endpoint *ep, struct sctp_association *asoc, __u16 key_id) { struct sctp_shared_key *key; struct list_head *sh_keys; int found = 0; /* The key identifier MUST NOT be the current active key * The key identifier MUST correst to an existing key */ if (asoc) { if (!asoc->peer.auth_capable) return -EACCES; if (asoc->active_key_id == key_id) return -EINVAL; sh_keys = &asoc->endpoint_shared_keys; } else { if (!ep->auth_enable) return -EACCES; if (ep->active_key_id == key_id) return -EINVAL; sh_keys = &ep->endpoint_shared_keys; } key_for_each(key, sh_keys) { if (key->key_id == key_id) { found = 1; break; } } if (!found) return -EINVAL; /* Delete the shared key */ list_del_init(&key->key_list); sctp_auth_shkey_release(key); return 0; } int sctp_auth_deact_key_id(struct sctp_endpoint *ep, struct sctp_association *asoc, __u16 key_id) { struct sctp_shared_key *key; struct list_head *sh_keys; int found = 0; /* The key identifier MUST NOT be the current active key * The key identifier MUST correst to an existing key */ if (asoc) { if (!asoc->peer.auth_capable) return -EACCES; if (asoc->active_key_id == key_id) return -EINVAL; sh_keys = &asoc->endpoint_shared_keys; } else { if (!ep->auth_enable) return -EACCES; if (ep->active_key_id == key_id) return -EINVAL; sh_keys = &ep->endpoint_shared_keys; } key_for_each(key, sh_keys) { if (key->key_id == 
key_id) { found = 1; break; } } if (!found) return -EINVAL; /* refcnt == 1 and !list_empty mean it's not being used anywhere * and deactivated will be set, so it's time to notify userland * that this shkey can be freed. */ if (asoc && !list_empty(&key->key_list) && refcount_read(&key->refcnt) == 1) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_authkey(asoc, key->key_id, SCTP_AUTH_FREE_KEY, GFP_KERNEL); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } key->deactivated = 1; return 0; } int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp) { /* Allocate space for HMACS and CHUNKS authentication * variables. There are arrays that we encode directly * into parameters to make the rest of the operations easier. */ if (!ep->auth_hmacs_list) { struct sctp_hmac_algo_param *auth_hmacs; auth_hmacs = kzalloc(struct_size(auth_hmacs, hmac_ids, SCTP_AUTH_NUM_HMACS), gfp); if (!auth_hmacs) goto nomem; /* Initialize the HMACS parameter. * SCTP-AUTH: Section 3.3 * Every endpoint supporting SCTP chunk authentication MUST * support the HMAC based on the SHA-1 algorithm. */ auth_hmacs->param_hdr.type = SCTP_PARAM_HMAC_ALGO; auth_hmacs->param_hdr.length = htons(sizeof(struct sctp_paramhdr) + 2); auth_hmacs->hmac_ids[0] = htons(SCTP_AUTH_HMAC_ID_SHA1); ep->auth_hmacs_list = auth_hmacs; } if (!ep->auth_chunk_list) { struct sctp_chunks_param *auth_chunks; auth_chunks = kzalloc(sizeof(*auth_chunks) + SCTP_NUM_CHUNK_TYPES, gfp); if (!auth_chunks) goto nomem; /* Initialize the CHUNKS parameter */ auth_chunks->param_hdr.type = SCTP_PARAM_CHUNKS; auth_chunks->param_hdr.length = htons(sizeof(struct sctp_paramhdr)); ep->auth_chunk_list = auth_chunks; } return 0; nomem: /* Free all allocations */ kfree(ep->auth_hmacs_list); kfree(ep->auth_chunk_list); ep->auth_hmacs_list = NULL; ep->auth_chunk_list = NULL; return -ENOMEM; } void sctp_auth_free(struct sctp_endpoint *ep) { kfree(ep->auth_hmacs_list); kfree(ep->auth_chunk_list); ep->auth_hmacs_list = NULL; ep->auth_chunk_list = NULL; }
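The endpoint/association key management above (sctp_auth_set_key(), sctp_auth_set_active_key()) is normally driven from user space through the SCTP_AUTH_KEY and SCTP_AUTH_ACTIVE_KEY socket options. The sketch below is a hedged user-space illustration, assuming the lksctp-tools <netinet/sctp.h> definitions of struct sctp_authkey and struct sctp_authkeyid; the key number and key bytes are made up.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>

static int install_and_activate_key(int sd, uint16_t keynum,
				    const uint8_t *key, uint16_t keylen)
{
	struct sctp_authkeyid active;
	struct sctp_authkey *ak;
	socklen_t len = sizeof(*ak) + keylen;
	int ret;

	ak = calloc(1, len);
	if (!ak)
		return -1;

	ak->sca_assoc_id = 0;		/* endpoint-wide shared key */
	ak->sca_keynumber = keynum;
	ak->sca_keylength = keylen;
	memcpy(ak->sca_key, key, keylen);

	/* Reaches sctp_auth_set_key() shown above. */
	ret = setsockopt(sd, IPPROTO_SCTP, SCTP_AUTH_KEY, ak, len);
	free(ak);
	if (ret)
		return ret;

	active.scact_assoc_id = 0;
	active.scact_keynumber = keynum;
	/* Reaches sctp_auth_set_active_key() shown above. */
	return setsockopt(sd, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
			  &active, sizeof(active));
}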
/*
 *
 * Author	Karsten Keil <kkeil@novell.com>
 *
 * Copyright 2008  by Karsten Keil <kkeil@novell.com>
 *
 * This code is free software; you can redistribute it and/or modify
 * it under the terms of the GNU LESSER GENERAL PUBLIC LICENSE
 * version 2.1 as published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU LESSER GENERAL PUBLIC LICENSE for more details.
* */ #ifndef mISDNIF_H #define mISDNIF_H #include <linux/types.h> #include <linux/errno.h> #include <linux/socket.h> /* * ABI Version 32 bit * * <8 bit> Major version * - changed if any interface become backwards incompatible * * <8 bit> Minor version * - changed if any interface is extended but backwards compatible * * <16 bit> Release number * - should be incremented on every checkin */ #define MISDN_MAJOR_VERSION 1 #define MISDN_MINOR_VERSION 1 #define MISDN_RELEASE 29 /* primitives for information exchange * generell format * <16 bit 0 > * <8 bit command> * BIT 8 = 1 LAYER private * BIT 7 = 1 answer * BIT 6 = 1 DATA * <8 bit target layer mask> * * Layer = 00 is reserved for general commands Layer = 01 L2 -> HW Layer = 02 HW -> L2 Layer = 04 L3 -> L2 Layer = 08 L2 -> L3 * Layer = FF is reserved for broadcast commands */ #define MISDN_CMDMASK 0xff00 #define MISDN_LAYERMASK 0x00ff /* generell commands */ #define OPEN_CHANNEL 0x0100 #define CLOSE_CHANNEL 0x0200 #define CONTROL_CHANNEL 0x0300 #define CHECK_DATA 0x0400 /* layer 2 -> layer 1 */ #define PH_ACTIVATE_REQ 0x0101 #define PH_DEACTIVATE_REQ 0x0201 #define PH_DATA_REQ 0x2001 #define MPH_ACTIVATE_REQ 0x0501 #define MPH_DEACTIVATE_REQ 0x0601 #define MPH_INFORMATION_REQ 0x0701 #define PH_CONTROL_REQ 0x0801 /* layer 1 -> layer 2 */ #define PH_ACTIVATE_IND 0x0102 #define PH_ACTIVATE_CNF 0x4102 #define PH_DEACTIVATE_IND 0x0202 #define PH_DEACTIVATE_CNF 0x4202 #define PH_DATA_IND 0x2002 #define PH_DATA_E_IND 0x3002 #define MPH_ACTIVATE_IND 0x0502 #define MPH_DEACTIVATE_IND 0x0602 #define MPH_INFORMATION_IND 0x0702 #define PH_DATA_CNF 0x6002 #define PH_CONTROL_IND 0x0802 #define PH_CONTROL_CNF 0x4802 /* layer 3 -> layer 2 */ #define DL_ESTABLISH_REQ 0x1004 #define DL_RELEASE_REQ 0x1104 #define DL_DATA_REQ 0x3004 #define DL_UNITDATA_REQ 0x3104 #define DL_INFORMATION_REQ 0x0004 /* layer 2 -> layer 3 */ #define DL_ESTABLISH_IND 0x1008 #define DL_ESTABLISH_CNF 0x5008 #define DL_RELEASE_IND 0x1108 #define DL_RELEASE_CNF 0x5108 #define DL_DATA_IND 0x3008 #define DL_UNITDATA_IND 0x3108 #define DL_INFORMATION_IND 0x0008 /* intern layer 2 management */ #define MDL_ASSIGN_REQ 0x1804 #define MDL_ASSIGN_IND 0x1904 #define MDL_REMOVE_REQ 0x1A04 #define MDL_REMOVE_IND 0x1B04 #define MDL_STATUS_UP_IND 0x1C04 #define MDL_STATUS_DOWN_IND 0x1D04 #define MDL_STATUS_UI_IND 0x1E04 #define MDL_ERROR_IND 0x1F04 #define MDL_ERROR_RSP 0x5F04 /* intern layer 2 */ #define DL_TIMER200_IND 0x7004 #define DL_TIMER203_IND 0x7304 #define DL_INTERN_MSG 0x7804 /* DL_INFORMATION_IND types */ #define DL_INFO_L2_CONNECT 0x0001 #define DL_INFO_L2_REMOVED 0x0002 /* PH_CONTROL types */ /* TOUCH TONE IS 0x20XX XX "0"..."9", "A","B","C","D","*","#" */ #define DTMF_TONE_VAL 0x2000 #define DTMF_TONE_MASK 0x007F #define DTMF_TONE_START 0x2100 #define DTMF_TONE_STOP 0x2200 #define DTMF_HFC_COEF 0x4000 #define DSP_CONF_JOIN 0x2403 #define DSP_CONF_SPLIT 0x2404 #define DSP_RECEIVE_OFF 0x2405 #define DSP_RECEIVE_ON 0x2406 #define DSP_ECHO_ON 0x2407 #define DSP_ECHO_OFF 0x2408 #define DSP_MIX_ON 0x2409 #define DSP_MIX_OFF 0x240a #define DSP_DELAY 0x240b #define DSP_JITTER 0x240c #define DSP_TXDATA_ON 0x240d #define DSP_TXDATA_OFF 0x240e #define DSP_TX_DEJITTER 0x240f #define DSP_TX_DEJ_OFF 0x2410 #define DSP_TONE_PATT_ON 0x2411 #define DSP_TONE_PATT_OFF 0x2412 #define DSP_VOL_CHANGE_TX 0x2413 #define DSP_VOL_CHANGE_RX 0x2414 #define DSP_BF_ENABLE_KEY 0x2415 #define DSP_BF_DISABLE 0x2416 #define DSP_BF_ACCEPT 0x2416 #define DSP_BF_REJECT 0x2417 #define DSP_PIPELINE_CFG 0x2418 #define 
HFC_VOL_CHANGE_TX 0x2601 #define HFC_VOL_CHANGE_RX 0x2602 #define HFC_SPL_LOOP_ON 0x2603 #define HFC_SPL_LOOP_OFF 0x2604 /* for T30 FAX and analog modem */ #define HW_MOD_FRM 0x4000 #define HW_MOD_FRH 0x4001 #define HW_MOD_FTM 0x4002 #define HW_MOD_FTH 0x4003 #define HW_MOD_FTS 0x4004 #define HW_MOD_CONNECT 0x4010 #define HW_MOD_OK 0x4011 #define HW_MOD_NOCARR 0x4012 #define HW_MOD_FCERROR 0x4013 #define HW_MOD_READY 0x4014 #define HW_MOD_LASTDATA 0x4015 /* DSP_TONE_PATT_ON parameter */ #define TONE_OFF 0x0000 #define TONE_GERMAN_DIALTONE 0x0001 #define TONE_GERMAN_OLDDIALTONE 0x0002 #define TONE_AMERICAN_DIALTONE 0x0003 #define TONE_GERMAN_DIALPBX 0x0004 #define TONE_GERMAN_OLDDIALPBX 0x0005 #define TONE_AMERICAN_DIALPBX 0x0006 #define TONE_GERMAN_RINGING 0x0007 #define TONE_GERMAN_OLDRINGING 0x0008 #define TONE_AMERICAN_RINGPBX 0x000b #define TONE_GERMAN_RINGPBX 0x000c #define TONE_GERMAN_OLDRINGPBX 0x000d #define TONE_AMERICAN_RINGING 0x000e #define TONE_GERMAN_BUSY 0x000f #define TONE_GERMAN_OLDBUSY 0x0010 #define TONE_AMERICAN_BUSY 0x0011 #define TONE_GERMAN_HANGUP 0x0012 #define TONE_GERMAN_OLDHANGUP 0x0013 #define TONE_AMERICAN_HANGUP 0x0014 #define TONE_SPECIAL_INFO 0x0015 #define TONE_GERMAN_GASSENBESETZT 0x0016 #define TONE_GERMAN_AUFSCHALTTON 0x0016 /* MPH_INFORMATION_IND */ #define L1_SIGNAL_LOS_OFF 0x0010 #define L1_SIGNAL_LOS_ON 0x0011 #define L1_SIGNAL_AIS_OFF 0x0012 #define L1_SIGNAL_AIS_ON 0x0013 #define L1_SIGNAL_RDI_OFF 0x0014 #define L1_SIGNAL_RDI_ON 0x0015 #define L1_SIGNAL_SLIP_RX 0x0020 #define L1_SIGNAL_SLIP_TX 0x0021 /* * protocol ids * D channel 1-31 * B channel 33 - 63 */ #define ISDN_P_NONE 0 #define ISDN_P_BASE 0 #define ISDN_P_TE_S0 0x01 #define ISDN_P_NT_S0 0x02 #define ISDN_P_TE_E1 0x03 #define ISDN_P_NT_E1 0x04 #define ISDN_P_TE_UP0 0x05 #define ISDN_P_NT_UP0 0x06 #define IS_ISDN_P_TE(p) ((p == ISDN_P_TE_S0) || (p == ISDN_P_TE_E1) || \ (p == ISDN_P_TE_UP0) || (p == ISDN_P_LAPD_TE)) #define IS_ISDN_P_NT(p) ((p == ISDN_P_NT_S0) || (p == ISDN_P_NT_E1) || \ (p == ISDN_P_NT_UP0) || (p == ISDN_P_LAPD_NT)) #define IS_ISDN_P_S0(p) ((p == ISDN_P_TE_S0) || (p == ISDN_P_NT_S0)) #define IS_ISDN_P_E1(p) ((p == ISDN_P_TE_E1) || (p == ISDN_P_NT_E1)) #define IS_ISDN_P_UP0(p) ((p == ISDN_P_TE_UP0) || (p == ISDN_P_NT_UP0)) #define ISDN_P_LAPD_TE 0x10 #define ISDN_P_LAPD_NT 0x11 #define ISDN_P_B_MASK 0x1f #define ISDN_P_B_START 0x20 #define ISDN_P_B_RAW 0x21 #define ISDN_P_B_HDLC 0x22 #define ISDN_P_B_X75SLP 0x23 #define ISDN_P_B_L2DTMF 0x24 #define ISDN_P_B_L2DSP 0x25 #define ISDN_P_B_L2DSPHDLC 0x26 #define ISDN_P_B_T30_FAX 0x27 #define ISDN_P_B_MODEM_ASYNC 0x28 #define OPTION_L2_PMX 1 #define OPTION_L2_PTP 2 #define OPTION_L2_FIXEDTEI 3 #define OPTION_L2_CLEANUP 4 #define OPTION_L1_HOLD 5 /* should be in sync with linux/kobject.h:KOBJ_NAME_LEN */ #define MISDN_MAX_IDLEN 20 struct mISDNhead { unsigned int prim; unsigned int id; } __packed; #define MISDN_HEADER_LEN sizeof(struct mISDNhead) #define MAX_DATA_SIZE 2048 #define MAX_DATA_MEM (MAX_DATA_SIZE + MISDN_HEADER_LEN) #define MAX_DFRAME_LEN 260 #define MISDN_ID_ADDR_MASK 0xFFFF #define MISDN_ID_TEI_MASK 0xFF00 #define MISDN_ID_SAPI_MASK 0x00FF #define MISDN_ID_TEI_ANY 0x7F00 #define MISDN_ID_ANY 0xFFFF #define MISDN_ID_NONE 0xFFFE #define GROUP_TEI 127 #define TEI_SAPI 63 #define CTRL_SAPI 0 #define MISDN_MAX_CHANNEL 127 #define MISDN_CHMAP_SIZE ((MISDN_MAX_CHANNEL + 1) >> 3) #define SOL_MISDN 0 struct sockaddr_mISDN { sa_family_t family; unsigned char dev; unsigned char channel; unsigned char sapi; unsigned char tei; }; 
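For context, struct sockaddr_mISDN above is the address user space passes to bind() on a PF_ISDN socket. The following sketch is an assumption-laden illustration only (device 0, a LAPD TE D-channel, the group TEI); it presumes the mISDN protocol constants and the sockaddr layout are visible to user space, e.g. through the mISDNuser copy of this header, and that data sockets are opened as SOCK_DGRAM.

#include <sys/socket.h>
#include <unistd.h>

/* Hypothetical helper: open and bind a LAPD TE D-channel socket. */
static int open_d_channel(unsigned char device)
{
	struct sockaddr_mISDN addr = {
		.family  = AF_ISDN,
		.dev     = device,
		.channel = 0,
		.sapi    = 0,
		.tei     = 127,		/* GROUP_TEI as defined above */
	};
	int sd = socket(PF_ISDN, SOCK_DGRAM, ISDN_P_LAPD_TE);

	if (sd < 0)
		return -1;
	if (bind(sd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(sd);
		return -1;
	}
	return sd;
}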
struct mISDNversion { unsigned char major; unsigned char minor; unsigned short release; }; struct mISDN_devinfo { u_int id; u_int Dprotocols; u_int Bprotocols; u_int protocol; u_char channelmap[MISDN_CHMAP_SIZE]; u_int nrbchan; char name[MISDN_MAX_IDLEN]; }; struct mISDN_devrename { u_int id; char name[MISDN_MAX_IDLEN]; /* new name */ }; /* MPH_INFORMATION_REQ payload */ struct ph_info_ch { __u32 protocol; __u64 Flags; }; struct ph_info_dch { struct ph_info_ch ch; __u16 state; __u16 num_bch; }; struct ph_info { struct ph_info_dch dch; struct ph_info_ch bch[]; }; /* timer device ioctl */ #define IMADDTIMER _IOR('I', 64, int) #define IMDELTIMER _IOR('I', 65, int) /* socket ioctls */ #define IMGETVERSION _IOR('I', 66, int) #define IMGETCOUNT _IOR('I', 67, int) #define IMGETDEVINFO _IOR('I', 68, int) #define IMCTRLREQ _IOR('I', 69, int) #define IMCLEAR_L2 _IOR('I', 70, int) #define IMSETDEVNAME _IOR('I', 71, struct mISDN_devrename) #define IMHOLD_L1 _IOR('I', 72, int) static inline int test_channelmap(u_int nr, u_char *map) { if (nr <= MISDN_MAX_CHANNEL) return map[nr >> 3] & (1 << (nr & 7)); else return 0; } static inline void set_channelmap(u_int nr, u_char *map) { map[nr >> 3] |= (1 << (nr & 7)); } static inline void clear_channelmap(u_int nr, u_char *map) { map[nr >> 3] &= ~(1 << (nr & 7)); } /* CONTROL_CHANNEL parameters */ #define MISDN_CTRL_GETOP 0x0000 #define MISDN_CTRL_LOOP 0x0001 #define MISDN_CTRL_CONNECT 0x0002 #define MISDN_CTRL_DISCONNECT 0x0004 #define MISDN_CTRL_RX_BUFFER 0x0008 #define MISDN_CTRL_PCMCONNECT 0x0010 #define MISDN_CTRL_PCMDISCONNECT 0x0020 #define MISDN_CTRL_SETPEER 0x0040 #define MISDN_CTRL_UNSETPEER 0x0080 #define MISDN_CTRL_RX_OFF 0x0100 #define MISDN_CTRL_FILL_EMPTY 0x0200 #define MISDN_CTRL_GETPEER 0x0400 #define MISDN_CTRL_L1_TIMER3 0x0800 #define MISDN_CTRL_HW_FEATURES_OP 0x2000 #define MISDN_CTRL_HW_FEATURES 0x2001 #define MISDN_CTRL_HFC_OP 0x4000 #define MISDN_CTRL_HFC_PCM_CONN 0x4001 #define MISDN_CTRL_HFC_PCM_DISC 0x4002 #define MISDN_CTRL_HFC_CONF_JOIN 0x4003 #define MISDN_CTRL_HFC_CONF_SPLIT 0x4004 #define MISDN_CTRL_HFC_RECEIVE_OFF 0x4005 #define MISDN_CTRL_HFC_RECEIVE_ON 0x4006 #define MISDN_CTRL_HFC_ECHOCAN_ON 0x4007 #define MISDN_CTRL_HFC_ECHOCAN_OFF 0x4008 #define MISDN_CTRL_HFC_WD_INIT 0x4009 #define MISDN_CTRL_HFC_WD_RESET 0x400A /* special RX buffer value for MISDN_CTRL_RX_BUFFER request.p1 is the minimum * buffer size request.p2 the maximum. Using MISDN_CTRL_RX_SIZE_IGNORE will * not change the value, but still read back the actual stetting. 
*/ #define MISDN_CTRL_RX_SIZE_IGNORE -1 /* socket options */ #define MISDN_TIME_STAMP 0x0001 struct mISDN_ctrl_req { int op; int channel; int p1; int p2; }; /* muxer options */ #define MISDN_OPT_ALL 1 #define MISDN_OPT_TEIMGR 2 #ifdef __KERNEL__ #include <linux/list.h> #include <linux/skbuff.h> #include <linux/net.h> #include <net/sock.h> #include <linux/completion.h> #define DEBUG_CORE 0x000000ff #define DEBUG_CORE_FUNC 0x00000002 #define DEBUG_SOCKET 0x00000004 #define DEBUG_MANAGER 0x00000008 #define DEBUG_SEND_ERR 0x00000010 #define DEBUG_MSG_THREAD 0x00000020 #define DEBUG_QUEUE_FUNC 0x00000040 #define DEBUG_L1 0x0000ff00 #define DEBUG_L1_FSM 0x00000200 #define DEBUG_L2 0x00ff0000 #define DEBUG_L2_FSM 0x00020000 #define DEBUG_L2_CTRL 0x00040000 #define DEBUG_L2_RECV 0x00080000 #define DEBUG_L2_TEI 0x00100000 #define DEBUG_L2_TEIFSM 0x00200000 #define DEBUG_TIMER 0x01000000 #define DEBUG_CLOCK 0x02000000 #define mISDN_HEAD_P(s) ((struct mISDNhead *)&s->cb[0]) #define mISDN_HEAD_PRIM(s) (((struct mISDNhead *)&s->cb[0])->prim) #define mISDN_HEAD_ID(s) (((struct mISDNhead *)&s->cb[0])->id) /* socket states */ #define MISDN_OPEN 1 #define MISDN_BOUND 2 #define MISDN_CLOSED 3 struct mISDNchannel; struct mISDNdevice; struct mISDNstack; struct mISDNclock; struct channel_req { u_int protocol; struct sockaddr_mISDN adr; struct mISDNchannel *ch; }; typedef int (ctrl_func_t)(struct mISDNchannel *, u_int, void *); typedef int (send_func_t)(struct mISDNchannel *, struct sk_buff *); typedef int (create_func_t)(struct channel_req *); struct Bprotocol { struct list_head list; char *name; u_int Bprotocols; create_func_t *create; }; struct mISDNchannel { struct list_head list; u_int protocol; u_int nr; u_long opt; u_int addr; struct mISDNstack *st; struct mISDNchannel *peer; send_func_t *send; send_func_t *recv; ctrl_func_t *ctrl; }; struct mISDN_sock_list { struct hlist_head head; rwlock_t lock; }; struct mISDN_sock { struct sock sk; struct mISDNchannel ch; u_int cmask; struct mISDNdevice *dev; }; struct mISDNdevice { struct mISDNchannel D; u_int id; u_int Dprotocols; u_int Bprotocols; u_int nrbchan; u_char channelmap[MISDN_CHMAP_SIZE]; struct list_head bchannels; struct mISDNchannel *teimgr; struct device dev; }; struct mISDNstack { u_long status; struct mISDNdevice *dev; struct task_struct *thread; struct completion *notify; wait_queue_head_t workq; struct sk_buff_head msgq; struct list_head layer2; struct mISDNchannel *layer1; struct mISDNchannel own; struct mutex lmutex; /* protect lists */ struct mISDN_sock_list l1sock; #ifdef MISDN_MSG_STATS u_int msg_cnt; u_int sleep_cnt; u_int stopped_cnt; #endif }; typedef int (clockctl_func_t)(void *, int); struct mISDNclock { struct list_head list; char name[64]; int pri; clockctl_func_t *ctl; void *priv; }; /* global alloc/queue functions */ static inline struct sk_buff * mI_alloc_skb(unsigned int len, gfp_t gfp_mask) { struct sk_buff *skb; skb = alloc_skb(len + MISDN_HEADER_LEN, gfp_mask); if (likely(skb)) skb_reserve(skb, MISDN_HEADER_LEN); return skb; } static inline struct sk_buff * _alloc_mISDN_skb(u_int prim, u_int id, u_int len, void *dp, gfp_t gfp_mask) { struct sk_buff *skb = mI_alloc_skb(len, gfp_mask); struct mISDNhead *hh; if (!skb) return NULL; if (len) skb_put_data(skb, dp, len); hh = mISDN_HEAD_P(skb); hh->prim = prim; hh->id = id; return skb; } static inline void _queue_data(struct mISDNchannel *ch, u_int prim, u_int id, u_int len, void *dp, gfp_t gfp_mask) { struct sk_buff *skb; if (!ch->peer) return; skb = _alloc_mISDN_skb(prim, id, len, 
dp, gfp_mask); if (!skb) return; if (ch->recv(ch->peer, skb)) dev_kfree_skb(skb); } /* global register/unregister functions */ extern int mISDN_register_device(struct mISDNdevice *, struct device *parent, char *name); extern void mISDN_unregister_device(struct mISDNdevice *); extern int mISDN_register_Bprotocol(struct Bprotocol *); extern void mISDN_unregister_Bprotocol(struct Bprotocol *); extern struct mISDNclock *mISDN_register_clock(char *, int, clockctl_func_t *, void *); extern void mISDN_unregister_clock(struct mISDNclock *); static inline struct mISDNdevice *dev_to_mISDN(const struct device *dev) { if (dev) return dev_get_drvdata(dev); else return NULL; } extern void set_channel_address(struct mISDNchannel *, u_int, u_int); extern void mISDN_clock_update(struct mISDNclock *, int, ktime_t *); extern unsigned short mISDN_clock_get(void); extern const char *mISDNDevName4ch(struct mISDNchannel *); #endif /* __KERNEL__ */ #endif /* mISDNIF_H */
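As a small usage sketch of the inline helpers defined above, a layer driver could hand a received frame up to its peer with _queue_data(); the channel pointer, id and payload below are assumptions, not part of the header.

#include <linux/gfp.h>
#include <linux/skbuff.h>

/* my_deliver_data() is a hypothetical helper, not part of mISDNif.h. */
static void my_deliver_data(struct mISDNchannel *my_ch, unsigned int id,
			    void *buf, unsigned int len)
{
	/*
	 * _queue_data() allocates an skb with room for the mISDN header,
	 * copies the payload, fills in prim/id and passes the message to
	 * the peer channel's recv() callback, freeing it if delivery fails.
	 */
	_queue_data(my_ch, PH_DATA_IND, id, len, buf, GFP_ATOMIC);
}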
6 6 6 6 6 6 6 6 6 6 6 6 5 5 5 6 6 6 347 347 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 /* * This file implement the Wireless Extensions proc API. * * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com> * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved. * * (As all part of the Linux kernel, this file is GPL) */ /* * The /proc/net/wireless file is a human readable user-space interface * exporting various wireless specific statistics from the wireless devices. * This is the most popular part of the Wireless Extensions ;-) * * This interface is a pure clone of /proc/net/dev (in net/core/dev.c). * The content of the file is basically the content of "struct iw_statistics". */ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/wireless.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <net/iw_handler.h> #include <net/wext.h> static void wireless_seq_printf_stats(struct seq_file *seq, struct net_device *dev) { /* Get stats from the driver */ struct iw_statistics *stats = get_wireless_stats(dev); static struct iw_statistics nullstats = {}; /* show device if it's wireless regardless of current stats */ if (!stats) { #ifdef CONFIG_WIRELESS_EXT if (dev->wireless_handlers) stats = &nullstats; #endif #ifdef CONFIG_CFG80211 if (dev->ieee80211_ptr) stats = &nullstats; #endif } if (stats) { seq_printf(seq, "%6s: %04x %3d%c %3d%c %3d%c %6d %6d %6d " "%6d %6d %6d\n", dev->name, stats->status, stats->qual.qual, stats->qual.updated & IW_QUAL_QUAL_UPDATED ? '.' : ' ', ((__s32) stats->qual.level) - ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0), stats->qual.updated & IW_QUAL_LEVEL_UPDATED ? '.' : ' ', ((__s32) stats->qual.noise) - ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0), stats->qual.updated & IW_QUAL_NOISE_UPDATED ? '.' : ' ', stats->discard.nwid, stats->discard.code, stats->discard.fragment, stats->discard.retries, stats->discard.misc, stats->miss.beacon); if (stats != &nullstats) stats->qual.updated &= ~IW_QUAL_ALL_UPDATED; } } /* ---------------------------------------------------------------- */ /* * Print info for /proc/net/wireless (print all entries) */ static int wireless_dev_seq_show(struct seq_file *seq, void *v) { might_sleep(); if (v == SEQ_START_TOKEN) seq_printf(seq, "Inter-| sta-| Quality | Discarded " "packets | Missed | WE\n" " face | tus | link level noise | nwid " "crypt frag retry misc | beacon | %d\n", WIRELESS_EXT); else wireless_seq_printf_stats(seq, v); return 0; } static void *wireless_dev_seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); loff_t off; struct net_device *dev; rtnl_lock(); if (!*pos) return SEQ_START_TOKEN; off = 1; for_each_netdev(net, dev) if (off++ == *pos) return dev; return NULL; } static void *wireless_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct net *net = seq_file_net(seq); ++*pos; return v == SEQ_START_TOKEN ? 
first_net_device(net) : next_net_device(v); } static void wireless_dev_seq_stop(struct seq_file *seq, void *v) { rtnl_unlock(); } static const struct seq_operations wireless_seq_ops = { .start = wireless_dev_seq_start, .next = wireless_dev_seq_next, .stop = wireless_dev_seq_stop, .show = wireless_dev_seq_show, }; int __net_init wext_proc_init(struct net *net) { /* Create /proc/net/wireless entry */ if (!proc_create_net("wireless", 0444, net->proc_net, &wireless_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; return 0; } void __net_exit wext_proc_exit(struct net *net) { remove_proc_entry("wireless", net->proc_net); }
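Because /proc/net/wireless is a plain-text table, a user-space consumer only needs to read it line by line; the tiny reader below is a hypothetical example, not part of the wext code.

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/wireless", "r");

	if (!f)
		return 1;
	/* The first two lines are the header printed by wireless_dev_seq_show(). */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}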
// SPDX-License-Identifier: GPL-2.0+
/*
 * User-space Probes (UProbes)
 *
 * Copyright (C) IBM Corporation, 2008-2012
 * Authors:
 *	Srikar Dronamraju
 *	Jim Keniston
 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
 */

#include <linux/kernel.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>	/* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/export.h>
#include <linux/rmap.h>		/* anon_vma_prepare */
#include <linux/mmu_notifier.h>
#include <linux/swap.h>		/* folio_free_swap */
#include <linux/ptrace.h>	/* user_enable_single_step */
#include <linux/kdebug.h>	/* notifier mechanism */
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
#include <linux/khugepaged.h>
#include <linux/rcupdate_trace.h>
#include <linux/workqueue.h>
#include <linux/srcu.h>
#include <linux/oom.h>		/* check_stable_address_space */
#include <linux/pagewalk.h>

#include <linux/uprobes.h>

#define UINSNS_PER_PAGE			(PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
#define MAX_UPROBE_XOL_SLOTS		UINSNS_PER_PAGE

static struct rb_root uprobes_tree = RB_ROOT;
/*
 * allows us to skip the uprobe_mmap if there are no uprobe events active
 * at this time.  Probably a fine grained per inode count is better?
 */
#define no_uprobe_events()	RB_EMPTY_ROOT(&uprobes_tree)

static DEFINE_RWLOCK(uprobes_treelock);	/* serialize rbtree access */
static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);

#define UPROBES_HASH_SZ	13
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
#define uprobes_mmap_hash(v)	(&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])

DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);

/* Covers return_instance's uprobe lifetime. */
DEFINE_STATIC_SRCU(uretprobes_srcu);

/* Have a copy of original instruction */
#define UPROBE_COPY_INSN	0

struct uprobe {
	struct rb_node		rb_node;	/* node in the rb tree */
	refcount_t		ref;
	struct rw_semaphore	register_rwsem;
	struct rw_semaphore	consumer_rwsem;
	struct list_head	pending_list;
	struct list_head	consumers;
	struct inode		*inode;		/* Also hold a ref to inode */
	union {
		struct rcu_head		rcu;
		struct work_struct	work;
	};
	loff_t			offset;
	loff_t			ref_ctr_offset;
	unsigned long		flags;		/* "unsigned long" so bitops work */

	/*
	 * The generic code assumes that it has two members of unknown type
	 * owned by the arch-specific code:
	 *
	 *	insn -	copy_insn() saves the original instruction here for
	 *		arch_uprobe_analyze_insn().
	 *
	 *	ixol -	potentially modified instruction to execute out of
	 *		line, copied to xol_area by xol_get_insn_slot().
	 */
	struct arch_uprobe	arch;
};

struct delayed_uprobe {
	struct list_head list;
	struct uprobe *uprobe;
	struct mm_struct *mm;
};

static DEFINE_MUTEX(delayed_uprobe_lock);
static LIST_HEAD(delayed_uprobe_list);

/*
 * Execute out of line area: anonymous executable mapping installed
 * by the probed task to execute the copy of the original instruction
 * mangled by set_swbp().
 *
 * On a breakpoint hit, thread contests for a slot.  It frees the
 * slot after singlestep. Currently a fixed number of slots are
 * allocated.
 */
struct xol_area {
	wait_queue_head_t	wq;		/* if all slots are busy */
	unsigned long		*bitmap;	/* 0 = free slot */

	struct page		*page;
	/*
	 * We keep the vma's vm_start rather than a pointer to the vma
	 * itself.  The probed process or a naughty kernel module could make
	 * the vma go away, and we must handle that reasonably gracefully.
	 */
	unsigned long		vaddr;		/* Page(s) of instruction slots */
};

static void uprobe_warn(struct task_struct *t, const char *msg)
{
	pr_warn("uprobe: %s:%d failed to %s\n", t->comm, t->pid, msg);
}

/*
 * valid_vma: Verify if the specified vma is an executable vma
 * Relax restrictions while unregistering: vm_flags might have
 * changed after breakpoint was inserted.
 *	- is_register: indicates if we are in register context.
 *	- Return 1 if the specified virtual address is in an
 *	  executable vma.
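 *
 * Editor's worked example (illustrative, not part of the original
 * comment): a typical PROT_READ|PROT_EXEC private file mapping carries
 * VM_MAYEXEC but neither VM_HUGETLB, VM_MAYSHARE nor VM_WRITE, so
 *
 *	vm_flags & (VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE | VM_WRITE)
 *		== VM_MAYEXEC
 *
 * and the vma is accepted for registration.  If the same region were
 * also mapped writable, VM_WRITE would make the masked value differ
 * from VM_MAYEXEC and registration would be refused, while
 * unregistration (is_register == false) leaves VM_WRITE out of the
 * mask and still succeeds.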
*/ static bool valid_vma(struct vm_area_struct *vma, bool is_register) { vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE; if (is_register) flags |= VM_WRITE; return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC; } static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) { return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT); } static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) { return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start); } /** * is_swbp_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. * Default implementation of is_swbp_insn * Returns true if @insn is a breakpoint instruction. */ bool __weak is_swbp_insn(uprobe_opcode_t *insn) { return *insn == UPROBE_SWBP_INSN; } /** * is_trap_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. * Default implementation of is_trap_insn * Returns true if @insn is a breakpoint instruction. * * This function is needed for the case where an architecture has multiple * trap instructions (like powerpc). */ bool __weak is_trap_insn(uprobe_opcode_t *insn) { return is_swbp_insn(insn); } void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); kunmap_atomic(kaddr); } static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len) { void *kaddr = kmap_atomic(page); memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len); kunmap_atomic(kaddr); } static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, int nbytes, void *data) { uprobe_opcode_t old_opcode; bool is_swbp; /* * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here. * We do not check if it is any other 'trap variant' which could * be conditional trap instruction such as the one powerpc supports. * * The logic is that we do not care if the underlying instruction * is a trap variant; uprobes always wins over any other (gdb) * breakpoint. */ uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); if (is_swbp_insn(insn)) { if (is_swbp) /* register: already installed? */ return 0; } else { if (!is_swbp) /* unregister: was it changed by us? 
*/ return 0; } return 1; } static struct delayed_uprobe * delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm) { struct delayed_uprobe *du; list_for_each_entry(du, &delayed_uprobe_list, list) if (du->uprobe == uprobe && du->mm == mm) return du; return NULL; } static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm) { struct delayed_uprobe *du; if (delayed_uprobe_check(uprobe, mm)) return 0; du = kzalloc(sizeof(*du), GFP_KERNEL); if (!du) return -ENOMEM; du->uprobe = uprobe; du->mm = mm; list_add(&du->list, &delayed_uprobe_list); return 0; } static void delayed_uprobe_delete(struct delayed_uprobe *du) { if (WARN_ON(!du)) return; list_del(&du->list); kfree(du); } static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm) { struct list_head *pos, *q; struct delayed_uprobe *du; if (!uprobe && !mm) return; list_for_each_safe(pos, q, &delayed_uprobe_list) { du = list_entry(pos, struct delayed_uprobe, list); if (uprobe && du->uprobe != uprobe) continue; if (mm && du->mm != mm) continue; delayed_uprobe_delete(du); } } static bool valid_ref_ctr_vma(struct uprobe *uprobe, struct vm_area_struct *vma) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset); return uprobe->ref_ctr_offset && vma->vm_file && file_inode(vma->vm_file) == uprobe->inode && (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && vma->vm_start <= vaddr && vma->vm_end > vaddr; } static struct vm_area_struct * find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *tmp; for_each_vma(vmi, tmp) if (valid_ref_ctr_vma(uprobe, tmp)) return tmp; return NULL; } static int __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) { void *kaddr; struct page *page; int ret; short *ptr; if (!vaddr || !d) return -EINVAL; ret = get_user_pages_remote(mm, vaddr, 1, FOLL_WRITE, &page, NULL); if (unlikely(ret <= 0)) { /* * We are asking for 1 page. If get_user_pages_remote() fails, * it may return 0, in that case we have to return error. */ return ret == 0 ? -EBUSY : ret; } kaddr = kmap_atomic(page); ptr = kaddr + (vaddr & ~PAGE_MASK); if (unlikely(*ptr + d < 0)) { pr_warn("ref_ctr going negative. vaddr: 0x%lx, " "curr val: %d, delta: %d\n", vaddr, *ptr, d); ret = -EINVAL; goto out; } *ptr += d; ret = 0; out: kunmap_atomic(kaddr); put_page(page); return ret; } static void update_ref_ctr_warn(struct uprobe *uprobe, struct mm_struct *mm, short d) { pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n", d > 0 ? 
"increment" : "decrement", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) uprobe->ref_ctr_offset, mm); } static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, short d) { struct vm_area_struct *rc_vma; unsigned long rc_vaddr; int ret = 0; rc_vma = find_ref_ctr_vma(uprobe, mm); if (rc_vma) { rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset); ret = __update_ref_ctr(mm, rc_vaddr, d); if (ret) update_ref_ctr_warn(uprobe, mm, d); if (d > 0) return ret; } mutex_lock(&delayed_uprobe_lock); if (d > 0) ret = delayed_uprobe_add(uprobe, mm); else delayed_uprobe_remove(uprobe, mm); mutex_unlock(&delayed_uprobe_lock); return ret; } static bool orig_page_is_identical(struct vm_area_struct *vma, unsigned long vaddr, struct page *page, bool *pmd_mappable) { const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT; struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping, index); struct page *orig_page; bool identical; if (IS_ERR(orig_folio)) return false; orig_page = folio_file_page(orig_folio, index); *pmd_mappable = folio_test_pmd_mappable(orig_folio); identical = folio_test_uptodate(orig_folio) && pages_identical(page, orig_page); folio_put(orig_folio); return identical; } static int __uprobe_write(struct vm_area_struct *vma, struct folio_walk *fw, struct folio *folio, unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, bool is_register) { const unsigned long vaddr = insn_vaddr & PAGE_MASK; bool pmd_mappable; /* For now, we'll only handle PTE-mapped folios. */ if (fw->level != FW_LEVEL_PTE) return -EFAULT; /* * See can_follow_write_pte(): we'd actually prefer a writable PTE here, * but the VMA might not be writable. */ if (!pte_write(fw->pte)) { if (!PageAnonExclusive(fw->page)) return -EFAULT; if (unlikely(userfaultfd_pte_wp(vma, fw->pte))) return -EFAULT; /* SOFTDIRTY is handled via pte_mkdirty() below. */ } /* * We'll temporarily unmap the page and flush the TLB, such that we can * modify the page atomically. */ flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); copy_to_page(fw->page, insn_vaddr, insn, nbytes); /* * When unregistering, we may only zap a PTE if uffd is disabled and * there are no unexpected folio references ... */ if (is_register || userfaultfd_missing(vma) || (folio_ref_count(folio) != folio_expected_ref_count(folio) + 1)) goto remap; /* * ... and the mapped page is identical to the original page that * would get faulted in on next access. */ if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable)) goto remap; dec_mm_counter(vma->vm_mm, MM_ANONPAGES); folio_remove_rmap_pte(folio, fw->page, vma); if (!folio_mapped(folio) && folio_test_swapcache(folio) && folio_trylock(folio)) { folio_free_swap(folio); folio_unlock(folio); } folio_put(folio); return pmd_mappable; remap: /* * Make sure that our copy_to_page() changes become visible before the * set_pte_at() write. */ smp_wmb(); /* We modified the page. Make sure to mark the PTE dirty. */ set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte)); return 0; } /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction * supported by that architecture then we need to modify is_trap_at_addr and * uprobe_write_opcode accordingly. This would never be a problem for archs * that have fixed length instructions. 
* * uprobe_write_opcode - write the opcode at a given virtual address. * @auprobe: arch specific probepoint information. * @vma: the probed virtual memory area. * @opcode_vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @opcode_vaddr. * * Called with mm->mmap_lock held for write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, uprobe_opcode_t opcode, bool is_register) { return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, verify_opcode, is_register, true /* do_update_ref_ctr */, NULL); } int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, void *data) { const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; int ret, ref_ctr_updated = 0; unsigned int gup_flags = FOLL_FORCE; struct mmu_notifier_range range; struct folio_walk fw; struct folio *folio; struct page *page; uprobe = container_of(auprobe, struct uprobe, arch); if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) return -EINVAL; /* * When registering, we have to break COW to get an exclusive anonymous * page that we can safely modify. Use FOLL_WRITE to trigger a write * fault if required. When unregistering, we might be lucky and the * anon page is already gone. So defer write faults until really * required. Use FOLL_SPLIT_PMD, because __uprobe_write() * cannot deal with PMDs yet. */ if (is_register) gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; retry: ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL); if (ret <= 0) goto out; folio = page_folio(page); ret = verify(page, insn_vaddr, insn, nbytes, data); if (ret <= 0) { folio_put(folio); goto out; } /* We are going to replace instruction, update ref_ctr. */ if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); if (ret) { folio_put(folio); goto out; } ref_ctr_updated = 1; } ret = 0; if (unlikely(!folio_test_anon(folio) || folio_is_zone_device(folio))) { VM_WARN_ON_ONCE(is_register); folio_put(folio); goto out; } if (!is_register) { /* * In the common case, we'll be able to zap the page when * unregistering. So trigger MMU notifiers now, as we won't * be able to do it under PTL. */ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vaddr, vaddr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); } ret = -EAGAIN; /* Walk the page tables again, to perform the actual update. */ if (folio_walk_start(&fw, vma, vaddr, 0)) { if (fw.page == page) ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register); folio_walk_end(&fw, vma); } if (!is_register) mmu_notifier_invalidate_range_end(&range); folio_put(folio); switch (ret) { case -EFAULT: gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; fallthrough; case -EAGAIN: goto retry; default: break; } out: /* Revert back reference counter if instruction update failed. */ if (do_update_ref_ctr && ret < 0 && ref_ctr_updated) update_ref_ctr(uprobe, mm, is_register ? -1 : 1); /* try collapse pmd for compound page */ if (ret > 0) collapse_pte_mapped_thp(mm, vaddr, false); return ret < 0 ? ret : 0; } /** * set_swbp - store breakpoint at a given address. * @auprobe: arch specific probepoint information. * @vma: the probed virtual memory area. 
* @vaddr: the virtual address to insert the opcode. * * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. */ int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true); } /** * set_orig_insn - Restore the original instruction. * @vma: the probed virtual memory area. * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, restore the original opcode (opcode) at @vaddr. * Return 0 (success) or a negative errno. */ int __weak set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, false); } /* uprobe should have guaranteed positive refcount */ static struct uprobe *get_uprobe(struct uprobe *uprobe) { refcount_inc(&uprobe->ref); return uprobe; } /* * uprobe should have guaranteed lifetime, which can be either of: * - caller already has refcount taken (and wants an extra one); * - uprobe is RCU protected and won't be freed until after grace period; * - we are holding uprobes_treelock (for read or write, doesn't matter). */ static struct uprobe *try_get_uprobe(struct uprobe *uprobe) { if (refcount_inc_not_zero(&uprobe->ref)) return uprobe; return NULL; } static inline bool uprobe_is_active(struct uprobe *uprobe) { return !RB_EMPTY_NODE(&uprobe->rb_node); } static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu) { struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); kfree(uprobe); } static void uprobe_free_srcu(struct rcu_head *rcu) { struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace); } static void uprobe_free_deferred(struct work_struct *work) { struct uprobe *uprobe = container_of(work, struct uprobe, work); write_lock(&uprobes_treelock); if (uprobe_is_active(uprobe)) { write_seqcount_begin(&uprobes_seqcount); rb_erase(&uprobe->rb_node, &uprobes_tree); write_seqcount_end(&uprobes_seqcount); } write_unlock(&uprobes_treelock); /* * If application munmap(exec_vma) before uprobe_unregister() * gets called, we don't get a chance to remove uprobe from * delayed_uprobe_list from remove_breakpoint(). Do it here. */ mutex_lock(&delayed_uprobe_lock); delayed_uprobe_remove(uprobe, NULL); mutex_unlock(&delayed_uprobe_lock); /* start srcu -> rcu_tasks_trace -> kfree chain */ call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu); } static void put_uprobe(struct uprobe *uprobe) { if (!refcount_dec_and_test(&uprobe->ref)) return; INIT_WORK(&uprobe->work, uprobe_free_deferred); schedule_work(&uprobe->work); } /* Initialize hprobe as SRCU-protected "leased" uprobe */ static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx) { WARN_ON(!uprobe); hprobe->state = HPROBE_LEASED; hprobe->uprobe = uprobe; hprobe->srcu_idx = srcu_idx; } /* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */ static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe) { hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE; hprobe->uprobe = uprobe; hprobe->srcu_idx = -1; } /* * hprobe_consume() fetches hprobe's underlying uprobe and detects whether * uprobe is SRCU protected or is refcounted. hprobe_consume() can be * used only once for a given hprobe. 
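 *
 * Sketch of the intended calling sequence (editor's illustration; the
 * uretprobe dispatch code later in this file follows this pattern):
 *
 *	enum hprobe_state hstate;
 *	struct uprobe *uprobe;
 *
 *	uprobe = hprobe_consume(&ri->hprobe, &hstate);
 *	if (uprobe)
 *		...use uprobe while it is still SRCU- or refcount-protected...
 *	hprobe_finalize(&ri->hprobe, hstate);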
* * Caller has to call hprobe_finalize() and pass previous hprobe_state, so * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever * is appropriate. */ static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate) { *hstate = xchg(&hprobe->state, HPROBE_CONSUMED); switch (*hstate) { case HPROBE_LEASED: case HPROBE_STABLE: return hprobe->uprobe; case HPROBE_GONE: /* uprobe is NULL, no SRCU */ case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */ return NULL; default: WARN(1, "hprobe invalid state %d", *hstate); return NULL; } } /* * Reset hprobe state and, if hprobe was LEASED, release SRCU lock. * hprobe_finalize() can only be used from current context after * hprobe_consume() call (which determines uprobe and hstate value). */ static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate) { switch (hstate) { case HPROBE_LEASED: __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); break; case HPROBE_STABLE: put_uprobe(hprobe->uprobe); break; case HPROBE_GONE: case HPROBE_CONSUMED: break; default: WARN(1, "hprobe invalid state %d", hstate); break; } } /* * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED) * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of * them can win the race to perform SRCU unlocking. Whoever wins must perform * SRCU unlock. * * Returns underlying valid uprobe or NULL, if there was no underlying uprobe * to begin with or we failed to bump its refcount and it's going away. * * Returned non-NULL uprobe can be still safely used within an ongoing SRCU * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has * an extra refcount for caller to assume and use. Otherwise, it's not * guaranteed that returned uprobe has a positive refcount, so caller has to * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current * SRCU lock region. See dup_utask(). */ static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) { enum hprobe_state hstate; /* * Caller should guarantee that return_instance is not going to be * freed from under us. This can be achieved either through holding * rcu_read_lock() or by owning return_instance in the first place. * * Underlying uprobe is itself protected from reuse by SRCU, so ensure * SRCU lock is held properly. */ lockdep_assert(srcu_read_lock_held(&uretprobes_srcu)); hstate = READ_ONCE(hprobe->state); switch (hstate) { case HPROBE_STABLE: /* uprobe has positive refcount, bump refcount, if necessary */ return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe; case HPROBE_GONE: /* * SRCU was unlocked earlier and we didn't manage to take * uprobe refcnt, so it's effectively NULL */ return NULL; case HPROBE_CONSUMED: /* * uprobe was consumed, so it's effectively NULL as far as * uretprobe processing logic is concerned */ return NULL; case HPROBE_LEASED: { struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe); /* * Try to switch hprobe state, guarding against * hprobe_consume() or another hprobe_expire() racing with us. * Note, if we failed to get uprobe refcount, we use special * HPROBE_GONE state to signal that hprobe->uprobe shouldn't * be used as it will be freed after SRCU is unlocked. */ if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) { /* We won the race, we are the ones to unlock SRCU */ __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); return get ? 
get_uprobe(uprobe) : uprobe; } /* * We lost the race, undo refcount bump (if it ever happened), * unless caller would like an extra refcount anyways. */ if (uprobe && !get) put_uprobe(uprobe); /* * Even if hprobe_consume() or another hprobe_expire() wins * the state update race and unlocks SRCU from under us, we * still have a guarantee that underyling uprobe won't be * freed due to ongoing caller's SRCU lock region, so we can * return it regardless. Also, if `get` was true, we also have * an extra ref for the caller to own. This is used in dup_utask(). */ return uprobe; } default: WARN(1, "unknown hprobe state %d", hstate); return NULL; } } static __always_inline int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset, const struct uprobe *r) { if (l_inode < r->inode) return -1; if (l_inode > r->inode) return 1; if (l_offset < r->offset) return -1; if (l_offset > r->offset) return 1; return 0; } #define __node_2_uprobe(node) \ rb_entry((node), struct uprobe, rb_node) struct __uprobe_key { struct inode *inode; loff_t offset; }; static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b) { const struct __uprobe_key *a = key; return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b)); } static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b) { struct uprobe *u = __node_2_uprobe(a); return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b)); } /* * Assumes being inside RCU protected region. * No refcount is taken on returned uprobe. */ static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset) { struct __uprobe_key key = { .inode = inode, .offset = offset, }; struct rb_node *node; unsigned int seq; lockdep_assert(rcu_read_lock_trace_held()); do { seq = read_seqcount_begin(&uprobes_seqcount); node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key); /* * Lockless RB-tree lookups can result only in false negatives. * If the element is found, it is correct and can be returned * under RCU protection. If we find nothing, we need to * validate that seqcount didn't change. If it did, we have to * try again as we might have missed the element (false * negative). If seqcount is unchanged, search truly failed. */ if (node) return __node_2_uprobe(node); } while (read_seqcount_retry(&uprobes_seqcount, seq)); return NULL; } /* * Attempt to insert a new uprobe into uprobes_tree. * * If uprobe already exists (for given inode+offset), we just increment * refcount of previously existing uprobe. * * If not, a provided new instance of uprobe is inserted into the tree (with * assumed initial refcount == 1). * * In any case, we return a uprobe instance that ends up being in uprobes_tree. * Caller has to clean up new uprobe instance, if it ended up not being * inserted into the tree. * * We assume that uprobes_treelock is held for writing. */ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) { struct rb_node *node; again: node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); if (node) { struct uprobe *u = __node_2_uprobe(node); if (!try_get_uprobe(u)) { rb_erase(node, &uprobes_tree); RB_CLEAR_NODE(&u->rb_node); goto again; } return u; } return uprobe; } /* * Acquire uprobes_treelock and insert uprobe into uprobes_tree * (or reuse existing one, see __insert_uprobe() comments above). 
*/ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u; write_lock(&uprobes_treelock); write_seqcount_begin(&uprobes_seqcount); u = __insert_uprobe(uprobe); write_seqcount_end(&uprobes_seqcount); write_unlock(&uprobes_treelock); return u; } static void ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe) { pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx " "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) cur_uprobe->ref_ctr_offset, (unsigned long long) uprobe->ref_ctr_offset); } static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, loff_t ref_ctr_offset) { struct uprobe *uprobe, *cur_uprobe; uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); if (!uprobe) return ERR_PTR(-ENOMEM); uprobe->inode = inode; uprobe->offset = offset; uprobe->ref_ctr_offset = ref_ctr_offset; INIT_LIST_HEAD(&uprobe->consumers); init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); RB_CLEAR_NODE(&uprobe->rb_node); refcount_set(&uprobe->ref, 1); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); /* a uprobe exists for this inode:offset combination */ if (cur_uprobe != uprobe) { if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) { ref_ctr_mismatch_warn(cur_uprobe, uprobe); put_uprobe(cur_uprobe); kfree(uprobe); return ERR_PTR(-EINVAL); } kfree(uprobe); uprobe = cur_uprobe; } return uprobe; } static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { static atomic64_t id; down_write(&uprobe->consumer_rwsem); list_add_rcu(&uc->cons_node, &uprobe->consumers); uc->id = (__u64) atomic64_inc_return(&id); up_write(&uprobe->consumer_rwsem); } /* * For uprobe @uprobe, delete the consumer @uc. * Should never be called with consumer that's not part of @uprobe->consumers. */ static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); list_del_rcu(&uc->cons_node); up_write(&uprobe->consumer_rwsem); } static int __copy_insn(struct address_space *mapping, struct file *filp, void *insn, int nbytes, loff_t offset) { struct page *page; /* * Ensure that the page that has the original instruction is populated * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(), * see uprobe_register(). */ if (mapping->a_ops->read_folio) page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); if (IS_ERR(page)) return PTR_ERR(page); uprobe_copy_from_page(page, offset, insn, nbytes); put_page(page); return 0; } static int copy_insn(struct uprobe *uprobe, struct file *filp) { struct address_space *mapping = uprobe->inode->i_mapping; loff_t offs = uprobe->offset; void *insn = &uprobe->arch.insn; int size = sizeof(uprobe->arch.insn); int len, err = -EIO; /* Copy only available bytes, -EIO if nothing was read */ do { if (offs >= i_size_read(uprobe->inode)) break; len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK)); err = __copy_insn(mapping, filp, insn, len, offs); if (err) break; insn += len; offs += len; size -= len; } while (size); return err; } static int prepare_uprobe(struct uprobe *uprobe, struct file *file, struct mm_struct *mm, unsigned long vaddr) { int ret = 0; if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) return ret; /* TODO: move this into _register, until then we abuse this sem. 
*/ down_write(&uprobe->consumer_rwsem); if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) goto out; ret = copy_insn(uprobe, file); if (ret) goto out; ret = -ENOTSUPP; if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn)) goto out; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); if (ret) goto out; smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */ set_bit(UPROBE_COPY_INSN, &uprobe->flags); out: up_write(&uprobe->consumer_rwsem); return ret; } static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm) { return !uc->filter || uc->filter(uc, mm); } static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) { struct uprobe_consumer *uc; bool ret = false; down_read(&uprobe->consumer_rwsem); list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { ret = consumer_filter(uc, mm); if (ret) break; } up_read(&uprobe->consumer_rwsem); return ret; } static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long vaddr) { struct mm_struct *mm = vma->vm_mm; bool first_uprobe; int ret; ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); if (ret) return ret; /* * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), * the task can hit this breakpoint right after __replace_page(). */ first_uprobe = !mm_flags_test(MMF_HAS_UPROBES, mm); if (first_uprobe) mm_flags_set(MMF_HAS_UPROBES, mm); ret = set_swbp(&uprobe->arch, vma, vaddr); if (!ret) mm_flags_clear(MMF_RECALC_UPROBES, mm); else if (first_uprobe) mm_flags_clear(MMF_HAS_UPROBES, mm); return ret; } static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long vaddr) { struct mm_struct *mm = vma->vm_mm; mm_flags_set(MMF_RECALC_UPROBES, mm); return set_orig_insn(&uprobe->arch, vma, vaddr); } struct map_info { struct map_info *next; struct mm_struct *mm; unsigned long vaddr; }; static inline struct map_info *free_map_info(struct map_info *info) { struct map_info *next = info->next; kfree(info); return next; } static struct map_info * build_map_info(struct address_space *mapping, loff_t offset, bool is_register) { unsigned long pgoff = offset >> PAGE_SHIFT; struct vm_area_struct *vma; struct map_info *curr = NULL; struct map_info *prev = NULL; struct map_info *info; int more = 0; again: i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; if (!prev && !more) { /* * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through * reclaim. This is optimistic, no harm done if it fails. 
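 *
 * Editor's summary of the retry scheme implemented by build_map_info()
 * (illustrative, not from the original comment):
 *
 *	1. while holding i_mmap_lock_read(), serve each matching vma from
 *	   the 'prev' chain (or a one-off GFP_NOWAIT allocation); vmas that
 *	   cannot be served are only counted in 'more';
 *	2. after dropping the lock, if 'more' is non-zero, allocate that
 *	   many entries with GFP_KERNEL, chain them onto 'prev' and redo
 *	   the whole walk ("goto again");
 *	3. once a pass completes with 'more' == 0, the collected 'curr'
 *	   list is returned to register_for_each_vma().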
*/ prev = kmalloc(sizeof(struct map_info), GFP_NOWAIT | __GFP_NOMEMALLOC); if (prev) prev->next = NULL; } if (!prev) { more++; continue; } if (!mmget_not_zero(vma->vm_mm)) continue; info = prev; prev = prev->next; info->next = curr; curr = info; info->mm = vma->vm_mm; info->vaddr = offset_to_vaddr(vma, offset); } i_mmap_unlock_read(mapping); if (!more) goto out; prev = curr; while (curr) { mmput(curr->mm); curr = curr->next; } do { info = kmalloc(sizeof(struct map_info), GFP_KERNEL); if (!info) { curr = ERR_PTR(-ENOMEM); goto out; } info->next = prev; prev = info; } while (--more); goto again; out: while (prev) prev = free_map_info(prev); return curr; } static int register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) { bool is_register = !!new; struct map_info *info; int err = 0; percpu_down_write(&dup_mmap_sem); info = build_map_info(uprobe->inode->i_mapping, uprobe->offset, is_register); if (IS_ERR(info)) { err = PTR_ERR(info); goto out; } while (info) { struct mm_struct *mm = info->mm; struct vm_area_struct *vma; if (err && is_register) goto free; /* * We take mmap_lock for writing to avoid the race with * find_active_uprobe_rcu() which takes mmap_lock for reading. * Thus this install_breakpoint() can not make * is_trap_at_addr() true right after find_uprobe_rcu() * returns NULL in find_active_uprobe_rcu(). */ mmap_write_lock(mm); if (check_stable_address_space(mm)) goto unlock; vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || file_inode(vma->vm_file) != uprobe->inode) goto unlock; if (vma->vm_start > info->vaddr || vaddr_to_offset(vma, info->vaddr) != uprobe->offset) goto unlock; if (is_register) { /* consult only the "caller", new consumer. */ if (consumer_filter(new, mm)) err = install_breakpoint(uprobe, vma, info->vaddr); } else if (mm_flags_test(MMF_HAS_UPROBES, mm)) { if (!filter_chain(uprobe, mm)) err |= remove_breakpoint(uprobe, vma, info->vaddr); } unlock: mmap_write_unlock(mm); free: mmput(mm); info = free_map_info(info); } out: percpu_up_write(&dup_mmap_sem); return err; } /** * uprobe_unregister_nosync - unregister an already registered probe. * @uprobe: uprobe to remove * @uc: identify which probe if multiple probes are colocated. */ void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err; down_write(&uprobe->register_rwsem); consumer_del(uprobe, uc); err = register_for_each_vma(uprobe, NULL); up_write(&uprobe->register_rwsem); /* TODO : cant unregister? schedule a worker thread */ if (unlikely(err)) { uprobe_warn(current, "unregister, leaking uprobe"); return; } put_uprobe(uprobe); } EXPORT_SYMBOL_GPL(uprobe_unregister_nosync); void uprobe_unregister_sync(void) { /* * Now that handler_chain() and handle_uretprobe_chain() iterate over * uprobe->consumers list under RCU protection without holding * uprobe->register_rwsem, we need to wait for RCU grace period to * make sure that we can't call into just unregistered * uprobe_consumer's callbacks anymore. If we don't do that, fast and * unlucky enough caller can free consumer's memory and cause * handler_chain() or handle_uretprobe_chain() to do an use-after-free. */ synchronize_rcu_tasks_trace(); synchronize_srcu(&uretprobes_srcu); } EXPORT_SYMBOL_GPL(uprobe_unregister_sync); /** * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. * @ref_ctr_offset: offset of SDT marker / reference counter * @uc: information on howto handle the probe.. 
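 *
 * Example (editor's sketch of a hypothetical consumer; the handler
 * prototypes are the ones declared in include/linux/uprobes.h):
 *
 *	static struct uprobe_consumer my_uc = {
 *		.handler	= my_handler,		// breakpoint hit
 *		.ret_handler	= my_ret_handler,	// optional: function return
 *	};
 *
 *	uprobe = uprobe_register(inode, offset, 0, &my_uc);
 *	if (IS_ERR(uprobe))
 *		return PTR_ERR(uprobe);
 *	...
 *	uprobe_unregister_nosync(uprobe, &my_uc);
 *	uprobe_unregister_sync();	// only free my_uc after this returns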
* * Apart from the access refcount, uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe * unregisters. Caller of uprobe_register() is required to keep @inode * (and the containing mount) referenced. * * Return: pointer to the new uprobe on success or an ERR_PTR on failure. */ struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; /* Uprobe must have at least one set consumer */ if (!uc->handler && !uc->ret_handler) return ERR_PTR(-EINVAL); /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ if (!inode->i_mapping->a_ops->read_folio && !shmem_mapping(inode->i_mapping)) return ERR_PTR(-EIO); /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return ERR_PTR(-EINVAL); /* * This ensures that uprobe_copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) return ERR_PTR(-EINVAL); if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) return ERR_PTR(-EINVAL); uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (IS_ERR(uprobe)) return uprobe; down_write(&uprobe->register_rwsem); consumer_add(uprobe, uc); ret = register_for_each_vma(uprobe, uc); up_write(&uprobe->register_rwsem); if (ret) { uprobe_unregister_nosync(uprobe, uc); /* * Registration might have partially succeeded, so we can have * this consumer being called right at this time. We need to * sync here. It's ok, it's unlikely slow path. */ uprobe_unregister_sync(); return ERR_PTR(ret); } return uprobe; } EXPORT_SYMBOL_GPL(uprobe_register); /** * uprobe_apply - add or remove the breakpoints according to @uc->filter * @uprobe: uprobe which "owns" the breakpoint * @uc: consumer which wants to add more or remove some breakpoints * @add: add or remove the breakpoints * Return: 0 on success or negative error code. */ int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { struct uprobe_consumer *con; int ret = -ENOENT; down_write(&uprobe->register_rwsem); rcu_read_lock_trace(); list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { if (con == uc) { ret = register_for_each_vma(uprobe, add ? 
uc : NULL); break; } } rcu_read_unlock_trace(); up_write(&uprobe->register_rwsem); return ret; } static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int err = 0; mmap_write_lock(mm); for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; if (!valid_vma(vma, false) || file_inode(vma->vm_file) != uprobe->inode) continue; offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; if (uprobe->offset < offset || uprobe->offset >= offset + vma->vm_end - vma->vm_start) continue; vaddr = offset_to_vaddr(vma, uprobe->offset); err |= remove_breakpoint(uprobe, vma, vaddr); } mmap_write_unlock(mm); return err; } static struct rb_node * find_node_in_range(struct inode *inode, loff_t min, loff_t max) { struct rb_node *n = uprobes_tree.rb_node; while (n) { struct uprobe *u = rb_entry(n, struct uprobe, rb_node); if (inode < u->inode) { n = n->rb_left; } else if (inode > u->inode) { n = n->rb_right; } else { if (max < u->offset) n = n->rb_left; else if (min > u->offset) n = n->rb_right; else break; } } return n; } /* * For a given range in vma, build a list of probes that need to be inserted. */ static void build_probe_list(struct inode *inode, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *head) { loff_t min, max; struct rb_node *n, *t; struct uprobe *u; INIT_LIST_HEAD(head); min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset < min) break; /* if uprobe went away, it's safe to ignore it */ if (try_get_uprobe(u)) list_add(&u->pending_list, head); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; /* if uprobe went away, it's safe to ignore it */ if (try_get_uprobe(u)) list_add(&u->pending_list, head); } } read_unlock(&uprobes_treelock); } /* @vma contains reference counter, not the probed instruction. */ static int delayed_ref_ctr_inc(struct vm_area_struct *vma) { struct list_head *pos, *q; struct delayed_uprobe *du; unsigned long vaddr; int ret = 0, err = 0; mutex_lock(&delayed_uprobe_lock); list_for_each_safe(pos, q, &delayed_uprobe_list) { du = list_entry(pos, struct delayed_uprobe, list); if (du->mm != vma->vm_mm || !valid_ref_ctr_vma(du->uprobe, vma)) continue; vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset); ret = __update_ref_ctr(vma->vm_mm, vaddr, 1); if (ret) { update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1); if (!err) err = ret; } delayed_uprobe_delete(du); } mutex_unlock(&delayed_uprobe_lock); return err; } /* * Called from mmap_region/vma_merge with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. */ int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; struct uprobe *uprobe, *u; struct inode *inode; if (no_uprobe_events()) return 0; if (vma->vm_file && (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm)) delayed_ref_ctr_inc(vma); if (!valid_vma(vma, true)) return 0; inode = file_inode(vma->vm_file); if (!inode) return 0; mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); /* * We can race with uprobe_unregister(), this uprobe can be already * removed. 
But in this case filter_chain() must return false, all * consumers have gone away. */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!fatal_signal_pending(current) && filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma, vaddr); } put_uprobe(uprobe); } mutex_unlock(uprobes_mmap_hash(inode)); return 0; } static bool vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end) { loff_t min, max; struct inode *inode; struct rb_node *n; inode = file_inode(vma->vm_file); min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); read_unlock(&uprobes_treelock); return !!n; } /* * Called in context of a munmap of a vma. */ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (no_uprobe_events() || !valid_vma(vma, false)) return; if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; if (!mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm) || mm_flags_test(MMF_RECALC_UPROBES, vma->vm_mm)) return; if (vma_has_uprobes(vma, start, end)) mm_flags_set(MMF_RECALC_UPROBES, vma->vm_mm); } static vm_fault_t xol_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { struct xol_area *area = vma->vm_mm->uprobes_state.xol_area; vmf->page = area->page; get_page(vmf->page); return 0; } static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { return -EPERM; } static const struct vm_special_mapping xol_mapping = { .name = "[uprobes]", .fault = xol_fault, .mremap = xol_mremap, }; /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { struct vm_area_struct *vma; int ret; if (mmap_write_lock_killable(mm)) return -EINTR; if (mm->uprobes_state.xol_area) { ret = -EALREADY; goto fail; } if (!area->vaddr) { /* Try to map as high as possible, this is only a hint. */ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); if (IS_ERR_VALUE(area->vaddr)) { ret = area->vaddr; goto fail; } } vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO| VM_SEALED_SYSMAP, &xol_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto fail; } ret = 0; /* pairs with get_xol_area() */ smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */ fail: mmap_write_unlock(mm); return ret; } void * __weak arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; *psize = UPROBE_SWBP_INSN_SIZE; return &insn; } static struct xol_area *__create_xol_area(unsigned long vaddr) { struct mm_struct *mm = current->mm; unsigned long insns_size; struct xol_area *area; void *insns; area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) goto out; area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long), GFP_KERNEL); if (!area->bitmap) goto free_area; area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); if (!area->page) goto free_bitmap; area->vaddr = vaddr; init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); insns = arch_uretprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) return area; __free_page(area->page); free_bitmap: kfree(area->bitmap); free_area: kfree(area); out: return NULL; } /* * get_xol_area - Allocate process's xol_area if necessary. 
* This area will be used for storing instructions for execution out of line. * * Returns the allocated area or NULL. */ static struct xol_area *get_xol_area(void) { struct mm_struct *mm = current->mm; struct xol_area *area; if (!mm->uprobes_state.xol_area) __create_xol_area(0); /* Pairs with xol_add_vma() smp_store_release() */ area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */ return area; } void __weak arch_uprobe_clear_state(struct mm_struct *mm) { } void __weak arch_uprobe_init_state(struct mm_struct *mm) { } /* * uprobe_clear_state - Free the area allocated for slots. */ void uprobe_clear_state(struct mm_struct *mm) { struct xol_area *area = mm->uprobes_state.xol_area; mutex_lock(&delayed_uprobe_lock); delayed_uprobe_remove(NULL, mm); mutex_unlock(&delayed_uprobe_lock); arch_uprobe_clear_state(mm); if (!area) return; put_page(area->page); kfree(area->bitmap); kfree(area); } void uprobe_start_dup_mmap(void) { percpu_down_read(&dup_mmap_sem); } void uprobe_end_dup_mmap(void) { percpu_up_read(&dup_mmap_sem); } void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { if (mm_flags_test(MMF_HAS_UPROBES, oldmm)) { mm_flags_set(MMF_HAS_UPROBES, newmm); /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ mm_flags_set(MMF_RECALC_UPROBES, newmm); } } static unsigned long xol_get_slot_nr(struct xol_area *area) { unsigned long slot_nr; slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); if (slot_nr < UINSNS_PER_PAGE) { if (!test_and_set_bit(slot_nr, area->bitmap)) return slot_nr; } return UINSNS_PER_PAGE; } /* * xol_get_insn_slot - allocate a slot for xol. */ static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask) { struct xol_area *area = get_xol_area(); unsigned long slot_nr; if (!area) return false; wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE); utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES; arch_uprobe_copy_ixol(area->page, utask->xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return true; } /* * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot() */ static void xol_free_insn_slot(struct uprobe_task *utask) { struct xol_area *area = current->mm->uprobes_state.xol_area; unsigned long offset = utask->xol_vaddr - area->vaddr; unsigned int slot_nr; utask->xol_vaddr = 0; /* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */ if (WARN_ON_ONCE(offset >= PAGE_SIZE)) return; slot_nr = offset / UPROBE_XOL_SLOT_BYTES; clear_bit(slot_nr, area->bitmap); smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ if (waitqueue_active(&area->wq)) wake_up(&area->wq); } void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len) { /* Initialize the slot */ copy_to_page(page, vaddr, src, len); /* * We probably need flush_icache_user_page() but it needs vma. * This should work on most of architectures by default. If * architecture needs to do something different it can define * its own version of the function. */ flush_dcache_page(page); } /** * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs * @regs: Reflects the saved state of the task after it has hit a breakpoint * instruction. * Return the address of the breakpoint instruction. 
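 *
 * Editor's example: on x86 the breakpoint is the single-byte int3
 * (0xcc) and the trap leaves the saved instruction pointer just past
 * it, so the probed address is recovered as
 *
 *	instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE	// ip - 1 on x86
 *
 * Architectures where the exception reports the address of the
 * breakpoint instruction itself can override this weak helper instead.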
*/ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) { return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; } unsigned long uprobe_get_trap_addr(struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (unlikely(utask && utask->active_uprobe)) return utask->vaddr; return instruction_pointer(regs); } static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri) { ri->cons_cnt = 0; ri->next = utask->ri_pool; utask->ri_pool = ri; } static struct return_instance *ri_pool_pop(struct uprobe_task *utask) { struct return_instance *ri = utask->ri_pool; if (likely(ri)) utask->ri_pool = ri->next; return ri; } static void ri_free(struct return_instance *ri) { kfree(ri->extra_consumers); kfree_rcu(ri, rcu); } static void free_ret_instance(struct uprobe_task *utask, struct return_instance *ri, bool cleanup_hprobe) { unsigned seq; if (cleanup_hprobe) { enum hprobe_state hstate; (void)hprobe_consume(&ri->hprobe, &hstate); hprobe_finalize(&ri->hprobe, hstate); } /* * At this point return_instance is unlinked from utask's * return_instances list and this has become visible to ri_timer(). * If seqcount now indicates that ri_timer's return instance * processing loop isn't active, we can return ri into the pool of * to-be-reused return instances for future uretprobes. If ri_timer() * happens to be running right now, though, we fallback to safety and * just perform RCU-delated freeing of ri. * Admittedly, this is a rather simple use of seqcount, but it nicely * abstracts away all the necessary memory barriers, so we use * a well-supported kernel primitive here. */ if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { /* immediate reuse of ri without RCU GP is OK */ ri_pool_push(utask, ri); } else { /* we might be racing with ri_timer(), so play it safe */ ri_free(ri); } } /* * Called with no locks held. * Called in context of an exiting or an exec-ing thread. */ void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; struct return_instance *ri, *ri_next; if (!utask) return; t->utask = NULL; WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr); timer_delete_sync(&utask->ri_timer); ri = utask->return_instances; while (ri) { ri_next = ri->next; free_ret_instance(utask, ri, true /* cleanup_hprobe */); ri = ri_next; } /* free_ret_instance() above might add to ri_pool, so this loop should come last */ ri = utask->ri_pool; while (ri) { ri_next = ri->next; ri_free(ri); ri = ri_next; } kfree(utask); } #define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */ #define for_each_ret_instance_rcu(pos, head) \ for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next)) static void ri_timer(struct timer_list *timer) { struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer); struct return_instance *ri; /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */ guard(srcu)(&uretprobes_srcu); /* RCU protects return_instance from freeing. */ guard(rcu)(); /* * See free_ret_instance() for notes on seqcount use. * We also employ raw API variants to avoid lockdep false-positive * warning complaining about enabled preemption. The timer can only be * invoked once for a uprobe_task. Therefore there can only be one * writer. The reader does not require an even sequence count to make * progress, so it is OK to remain preemptible on PREEMPT_RT. 
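 *
 * Editor's sketch of the handshake with free_ret_instance()
 * (illustrative): ri_timer() brackets its walk of
 * utask->return_instances with raw_write_seqcount_begin()/_end(), while
 * free_ret_instance() uses raw_seqcount_try_begin() as the "is the
 * timer currently walking the list?" test and, only if it is not,
 * recycles the return_instance via ri_pool_push() instead of the
 * slower kfree_rcu()-based ri_free() path.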
*/ raw_write_seqcount_begin(&utask->ri_seqcount); for_each_ret_instance_rcu(ri, utask->return_instances) hprobe_expire(&ri->hprobe, false); raw_write_seqcount_end(&utask->ri_seqcount); } static struct uprobe_task *alloc_utask(void) { struct uprobe_task *utask; utask = kzalloc(sizeof(*utask), GFP_KERNEL); if (!utask) return NULL; timer_setup(&utask->ri_timer, ri_timer, 0); seqcount_init(&utask->ri_seqcount); return utask; } /* * Allocate a uprobe_task object for the task if necessary. * Called when the thread hits a breakpoint. * * Returns: * - pointer to new uprobe_task on success * - NULL otherwise */ static struct uprobe_task *get_utask(void) { if (!current->utask) current->utask = alloc_utask(); return current->utask; } static struct return_instance *alloc_return_instance(struct uprobe_task *utask) { struct return_instance *ri; ri = ri_pool_pop(utask); if (ri) return ri; ri = kzalloc(sizeof(*ri), GFP_KERNEL); if (!ri) return ZERO_SIZE_PTR; return ri; } static struct return_instance *dup_return_instance(struct return_instance *old) { struct return_instance *ri; ri = kmemdup(old, sizeof(*ri), GFP_KERNEL); if (!ri) return NULL; if (unlikely(old->cons_cnt > 1)) { ri->extra_consumers = kmemdup(old->extra_consumers, sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1), GFP_KERNEL); if (!ri->extra_consumers) { kfree(ri); return NULL; } } return ri; } static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) { struct uprobe_task *n_utask; struct return_instance **p, *o, *n; struct uprobe *uprobe; n_utask = alloc_utask(); if (!n_utask) return -ENOMEM; t->utask = n_utask; /* protect uprobes from freeing, we'll need try_get_uprobe() them */ guard(srcu)(&uretprobes_srcu); p = &n_utask->return_instances; for (o = o_utask->return_instances; o; o = o->next) { n = dup_return_instance(o); if (!n) return -ENOMEM; /* if uprobe is non-NULL, we'll have an extra refcount for uprobe */ uprobe = hprobe_expire(&o->hprobe, true); /* * New utask will have stable properly refcounted uprobe or * NULL. Even if we failed to get refcounted uprobe, we still * need to preserve full set of return_instances for proper * uretprobe handling and nesting in forked task. */ hprobe_init_stable(&n->hprobe, uprobe); n->next = NULL; rcu_assign_pointer(*p, n); p = &n->next; n_utask->depth++; } return 0; } static void dup_xol_work(struct callback_head *work) { if (current->flags & PF_EXITING) return; if (!__create_xol_area(current->utask->dup_xol_addr) && !fatal_signal_pending(current)) uprobe_warn(current, "dup xol area"); } /* * Called in context of a new clone/fork from copy_process. */ void uprobe_copy_process(struct task_struct *t, u64 flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; struct xol_area *area; t->utask = NULL; if (!utask || !utask->return_instances) return; if (mm == t->mm && !(flags & CLONE_VFORK)) return; if (dup_utask(t, utask)) return uprobe_warn(t, "dup ret instances"); /* The task can fork() after dup_xol_work() fails */ area = mm->uprobes_state.xol_area; if (!area) return uprobe_warn(t, "dup xol area"); if (mm == t->mm) return; t->utask->dup_xol_addr = area->vaddr; init_task_work(&t->utask->dup_xol_work, dup_xol_work); task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME); } /* * Current area->vaddr notion assume the trampoline address is always * equal area->vaddr. * * Returns -1 in case the xol_area is not allocated. 
*/ unsigned long uprobe_get_trampoline_vaddr(void) { unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR; struct xol_area *area; /* Pairs with xol_add_vma() smp_store_release() */ area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */ if (area) trampoline_vaddr = area->vaddr; return trampoline_vaddr; } static void cleanup_return_instances(struct uprobe_task *utask, bool chained, struct pt_regs *regs) { struct return_instance *ri = utask->return_instances, *ri_next; enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { ri_next = ri->next; rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; free_ret_instance(utask, ri, true /* cleanup_hprobe */); ri = ri_next; } } static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, struct return_instance *ri) { struct uprobe_task *utask = current->utask; unsigned long orig_ret_vaddr, trampoline_vaddr; bool chained; int srcu_idx; if (!get_xol_area()) goto free; if (utask->depth >= MAX_URETPROBE_DEPTH) { printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" " nestedness limit pid/tgid=%d/%d\n", current->pid, current->tgid); goto free; } trampoline_vaddr = uprobe_get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); if (orig_ret_vaddr == -1) goto free; /* drop the entries invalidated by longjmp() */ chained = (orig_ret_vaddr == trampoline_vaddr); cleanup_return_instances(utask, chained, regs); /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent * instances. This also makes breakpoint unwrapping easier. */ if (chained) { if (!utask->return_instances) { /* * This situation is not possible. Likely we have an * attack from user-space. */ uprobe_warn(current, "handle tail call"); goto free; } orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } /* __srcu_read_lock() because SRCU lock survives switch to user space */ srcu_idx = __srcu_read_lock(&uretprobes_srcu); ri->func = instruction_pointer(regs); ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; utask->depth++; hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx); ri->next = utask->return_instances; rcu_assign_pointer(utask->return_instances, ri); mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD); return; free: ri_free(ri); } /* Prepare to single-step probed instruction out of line. */ static int pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { struct uprobe_task *utask = current->utask; int err; if (!try_get_uprobe(uprobe)) return -EINVAL; if (!xol_get_insn_slot(uprobe, utask)) { err = -ENOMEM; goto err_out; } utask->vaddr = bp_vaddr; err = arch_uprobe_pre_xol(&uprobe->arch, regs); if (unlikely(err)) { xol_free_insn_slot(utask); goto err_out; } utask->active_uprobe = uprobe; utask->state = UTASK_SSTEP; return 0; err_out: put_uprobe(uprobe); return err; } /* * If we are singlestepping, then ensure this thread is not connected to * non-fatal signals until completion of singlestep. When xol insn itself * triggers the signal, restart the original insn even if the task is * already SIGKILL'ed (since coredump should report the correct ip). This * is even more important if the task has a handler for SIGSEGV/etc, The * _same_ instruction should be repeated again after return from the signal * handler, and SSTEP can never finish in this case. 
*/ bool uprobe_deny_signal(void) { struct task_struct *t = current; struct uprobe_task *utask = t->utask; if (likely(!utask || !utask->active_uprobe)) return false; WARN_ON_ONCE(utask->state != UTASK_SSTEP); if (task_sigpending(t)) { utask->signal_denied = true; clear_tsk_thread_flag(t, TIF_SIGPENDING); if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { utask->state = UTASK_SSTEP_TRAPPED; set_tsk_thread_flag(t, TIF_UPROBE); } } return true; } static void mmf_recalc_uprobes(struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; for_each_vma(vmi, vma) { if (!valid_vma(vma, false)) continue; /* * This is not strictly accurate, we can race with * uprobe_unregister() and see the already removed * uprobe if delete_uprobe() was not yet called. * Or this uprobe can be filtered out. */ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) return; } mm_flags_clear(MMF_HAS_UPROBES, mm); } static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) { struct page *page; uprobe_opcode_t opcode; int result; if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE))) return -EINVAL; pagefault_disable(); result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); if (likely(result == 0)) goto out; result = get_user_pages(vaddr, 1, FOLL_FORCE, &page); if (result < 0) return result; uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ return is_trap_insn(&opcode); } static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; struct vm_area_struct *vma; struct file *vm_file; loff_t offset; unsigned int seq; guard(rcu)(); if (!mmap_lock_speculate_try_begin(mm, &seq)) return NULL; vma = vma_lookup(mm, bp_vaddr); if (!vma) return NULL; /* * vm_file memory can be reused for another instance of struct file, * but can't be freed from under us, so it's safe to read fields from * it, even if the values are some garbage values; ultimately * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure * that whatever we speculatively found is correct */ vm_file = READ_ONCE(vma->vm_file); if (!vm_file) return NULL; offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start); uprobe = find_uprobe_rcu(vm_file->f_inode, offset); if (!uprobe) return NULL; /* now double check that nothing about MM changed */ if (mmap_lock_speculate_retry(mm, seq)) return NULL; return uprobe; } /* assumes being inside RCU protected region */ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; struct vm_area_struct *vma; uprobe = find_active_uprobe_speculative(bp_vaddr); if (uprobe) return uprobe; mmap_read_lock(mm); vma = vma_lookup(mm, bp_vaddr); if (vma) { if (vma->vm_file) { struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); uprobe = find_uprobe_rcu(inode, offset); } if (!uprobe) *is_swbp = is_trap_at_addr(mm, bp_vaddr); } else { *is_swbp = -EFAULT; } if (!uprobe && mm_flags_test_and_clear(MMF_RECALC_UPROBES, mm)) mmf_recalc_uprobes(mm); mmap_read_unlock(mm); return uprobe; } static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie) { struct return_consumer *ric; if (unlikely(ri == ZERO_SIZE_PTR)) return ri; if (unlikely(ri->cons_cnt > 0)) { ric = krealloc(ri->extra_consumers, 
sizeof(*ric) * ri->cons_cnt, GFP_KERNEL); if (!ric) { ri_free(ri); return ZERO_SIZE_PTR; } ri->extra_consumers = ric; } ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1]; ric->id = id; ric->cookie = cookie; ri->cons_cnt++; return ri; } static struct return_consumer * return_consumer_find(struct return_instance *ri, int *iter, int id) { struct return_consumer *ric; int idx; for (idx = *iter; idx < ri->cons_cnt; idx++) { ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1]; if (ric->id == id) { *iter = idx + 1; return ric; } } return NULL; } static bool ignore_ret_handler(int rc) { return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE; } static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; bool has_consumers = false, remove = true; struct return_instance *ri = NULL; struct uprobe_task *utask = current->utask; utask->auprobe = &uprobe->arch; list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { bool session = uc->handler && uc->ret_handler; __u64 cookie = 0; int rc = 0; if (uc->handler) { rc = uc->handler(uc, regs, &cookie); WARN(rc < 0 || rc > 2, "bad rc=0x%x from %ps()\n", rc, uc->handler); } remove &= rc == UPROBE_HANDLER_REMOVE; has_consumers = true; if (!uc->ret_handler || ignore_ret_handler(rc)) continue; if (!ri) ri = alloc_return_instance(utask); if (session) ri = push_consumer(ri, uc->id, cookie); } utask->auprobe = NULL; if (!ZERO_OR_NULL_PTR(ri)) prepare_uretprobe(uprobe, regs, ri); if (remove && has_consumers) { down_read(&uprobe->register_rwsem); /* re-check that removal is still required, this time under lock */ if (!filter_chain(uprobe, current->mm)) { WARN_ON(!uprobe_is_active(uprobe)); unapply_uprobe(uprobe, current->mm); } up_read(&uprobe->register_rwsem); } } static void handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs) { struct return_consumer *ric; struct uprobe_consumer *uc; int ric_idx = 0; /* all consumers unsubscribed meanwhile */ if (unlikely(!uprobe)) return; rcu_read_lock_trace(); list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { bool session = uc->handler && uc->ret_handler; if (uc->ret_handler) { ric = return_consumer_find(ri, &ric_idx, uc->id); if (!session || ric) uc->ret_handler(uc, ri->func, regs, ric ? &ric->cookie : NULL); } } rcu_read_unlock_trace(); } static struct return_instance *find_next_ret_chain(struct return_instance *ri) { bool chained; do { chained = ri->chained; ri = ri->next; /* can't be NULL if chained */ } while (chained); return ri; } void uprobe_handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri, *ri_next, *next_chain; struct uprobe *uprobe; enum hprobe_state hstate; bool valid; utask = current->utask; if (!utask) goto sigill; ri = utask->return_instances; if (!ri) goto sigill; do { /* * We should throw out the frames invalidated by longjmp(). * If this chain is valid, then the next one should be alive * or NULL; the latter case means that nobody but ri->func * could hit this trampoline on return. TODO: sigaltstack(). 
*/ next_chain = find_next_ret_chain(ri); valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs); instruction_pointer_set(regs, ri->orig_ret_vaddr); do { /* pop current instance from the stack of pending return instances, * as it's not pending anymore: we just fixed up original * instruction pointer in regs and are about to call handlers; * this allows fixup_uretprobe_trampoline_entries() to properly fix up * captured stack traces from uretprobe handlers, in which pending * trampoline addresses on the stack are replaced with correct * original return addresses */ ri_next = ri->next; rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; uprobe = hprobe_consume(&ri->hprobe, &hstate); if (valid) handle_uretprobe_chain(ri, uprobe, regs); hprobe_finalize(&ri->hprobe, hstate); /* We already took care of hprobe, no need to waste more time on that. */ free_ret_instance(utask, ri, false /* !cleanup_hprobe */); ri = ri_next; } while (ri != next_chain); } while (!valid); return; sigill: uprobe_warn(current, "handle uretprobe, sending SIGILL."); force_sig(SIGILL); } bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) { return false; } bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, struct pt_regs *regs) { return true; } void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) { } /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. */ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); if (bp_vaddr == uprobe_get_trampoline_vaddr()) return uprobe_handle_trampoline(regs); rcu_read_lock_trace(); uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ force_sig(SIGTRAP); } else { /* * Either we raced with uprobe_unregister() or we can't * access this memory. The latter is only possible if * another thread plays with our ->mm. In both cases * we can simply restart. If this vma was unmapped we * can pretend this insn was not executed yet and get * the (correct) SIGSEGV after restart. */ instruction_pointer_set(regs, bp_vaddr); } goto out; } /* change it in advance for ->handler() and restart */ instruction_pointer_set(regs, bp_vaddr); /* * TODO: move copy_insn/etc into _register and remove this hack. * After we hit the bp, _unregister + _register can install the * new and not-yet-analyzed uprobe at the same address, restart. */ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; /* * Pairs with the smp_wmb() in prepare_uprobe(). * * Guarantees that if we see the UPROBE_COPY_INSN bit set, then * we must also see the stores to &uprobe->arch performed by the * prepare_uprobe() call. */ smp_rmb(); /* Tracing handlers use ->utask to communicate with fetch methods */ if (!get_utask()) goto out; if (arch_uprobe_ignore(&uprobe->arch, regs)) goto out; handler_chain(uprobe, regs); /* Try to optimize after first hit. */ arch_uprobe_optimize(&uprobe->arch, bp_vaddr); /* * If user decided to take execution elsewhere, it makes little sense * to execute the original instruction, so let's skip it. 
*/ if (instruction_pointer(regs) != bp_vaddr) goto out; if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; if (pre_ssout(uprobe, regs, bp_vaddr)) goto out; out: /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ rcu_read_unlock_trace(); } void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr) { struct uprobe *uprobe; int is_swbp; guard(rcu_tasks_trace)(); uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); if (!uprobe) return; if (!get_utask()) return; if (arch_uprobe_ignore(&uprobe->arch, regs)) return; handler_chain(uprobe, regs); } /* * Perform required fix-ups and disable singlestep. * Allow pending signals to take effect. */ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) { struct uprobe *uprobe; int err = 0; uprobe = utask->active_uprobe; if (utask->state == UTASK_SSTEP_ACK) err = arch_uprobe_post_xol(&uprobe->arch, regs); else if (utask->state == UTASK_SSTEP_TRAPPED) arch_uprobe_abort_xol(&uprobe->arch, regs); else WARN_ON_ONCE(1); put_uprobe(uprobe); utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; xol_free_insn_slot(utask); if (utask->signal_denied) { set_thread_flag(TIF_SIGPENDING); utask->signal_denied = false; } if (unlikely(err)) { uprobe_warn(current, "execute the probed insn, sending SIGILL."); force_sig(SIGILL); } } /* * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and * allows the thread to return from interrupt. After that handle_swbp() * sets utask->active_uprobe. * * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag * and allows the thread to return from interrupt. * * While returning to userspace, thread notices the TIF_UPROBE flag and calls * uprobe_notify_resume(). */ void uprobe_notify_resume(struct pt_regs *regs) { struct uprobe_task *utask; clear_thread_flag(TIF_UPROBE); utask = current->utask; if (utask && utask->active_uprobe) handle_singlestep(utask, regs); else handle_swbp(regs); } /* * uprobe_pre_sstep_notifier gets called from interrupt context as part of * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit. */ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { if (!current->mm) return 0; if (!mm_flags_test(MMF_HAS_UPROBES, current->mm) && (!current->utask || !current->utask->return_instances)) return 0; set_thread_flag(TIF_UPROBE); return 1; } /* * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep. */ int uprobe_post_sstep_notifier(struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (!current->mm || !utask || !utask->active_uprobe) /* task is currently not uprobed */ return 0; utask->state = UTASK_SSTEP_ACK; set_thread_flag(TIF_UPROBE); return 1; } static struct notifier_block uprobe_exception_nb = { .notifier_call = arch_uprobe_exception_notify, .priority = INT_MAX-1, /* notified after kprobes, kgdb */ }; void __init uprobes_init(void) { int i; for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); BUG_ON(register_die_notifier(&uprobe_exception_nb)); }
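The handler_chain()/handle_uretprobe_chain() pair above defines the consumer contract: an entry handler may hand back a per-hit cookie, and a consumer that sets both handler and ret_handler (a "session" consumer) gets that cookie back when the probed function returns. Below is a minimal sketch of such a consumer. It assumes the uprobe_register()/uprobe_unregister_nosync() signatures of the kernel tree this file comes from, and every my_-prefixed name (my_entry, my_ret, my_consumer, my_attach, my_detach, my_inode, my_offset) is a placeholder for illustration only, not part of the original source.

/*
 * Sketch of a "session" uprobe consumer: handler_chain() records the cookie
 * returned from my_entry() in the return_instance, and handle_uretprobe_chain()
 * hands it back to my_ret() on function return. Illustrative only.
 */
#include <linux/uprobes.h>
#include <linux/ktime.h>
#include <linux/err.h>

static int my_entry(struct uprobe_consumer *uc, struct pt_regs *regs, __u64 *cookie)
{
	/* Stash the entry timestamp; push_consumer() copies it for the return hit. */
	*cookie = ktime_get_ns();
	return 0;			/* keep the probe installed for this task */
}

static int my_ret(struct uprobe_consumer *uc, unsigned long func,
		  struct pt_regs *regs, __u64 *cookie)
{
	/* cookie is non-NULL here because this consumer participated at entry */
	if (cookie)
		pr_info("func %lx took %llu ns\n", func, ktime_get_ns() - *cookie);
	return 0;
}

static struct uprobe_consumer my_consumer = {
	.handler	= my_entry,
	.ret_handler	= my_ret,
};

static struct uprobe *my_uprobe;

/* Registration, assuming the inode/offset of the probed instruction are known. */
static int my_attach(struct inode *my_inode, loff_t my_offset)
{
	my_uprobe = uprobe_register(my_inode, my_offset, 0, &my_consumer);
	return PTR_ERR_OR_ZERO(my_uprobe);
}

static void my_detach(void)
{
	uprobe_unregister_nosync(my_uprobe, &my_consumer);
	uprobe_unregister_sync();	/* wait for in-flight handlers to drain */
}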
25 24 24 22 21 19 20 20 20 19 20 1 19 19 19 19 18 19 2 2 22 24 3 3 2 1 3 12 12 12 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 // SPDX-License-Identifier: GPL-2.0-only #include <linux/fs.h> #include <linux/fs_struct.h> #include <linux/kernel_read_file.h> #include <linux/security.h> #include <linux/vmalloc.h> /** * kernel_read_file() - read file contents into a kernel buffer * * @file: file to read from * @offset: where to start reading from (see below). * @buf: pointer to a "void *" buffer for reading into (if * *@buf is NULL, a buffer will be allocated, and * @buf_size will be ignored) * @buf_size: size of buf, if already allocated. If @buf not * allocated, this is the largest size to allocate. * @file_size: if non-NULL, the full size of @file will be * written here. * @id: the kernel_read_file_id identifying the type of * file contents being read (for LSMs to examine) * * @offset must be 0 unless both @buf and @file_size are non-NULL * (i.e. the caller must be expecting to read partial file contents * via an already-allocated @buf, in at most @buf_size chunks, and * will be able to determine when the entire file was read by * checking @file_size). This isn't a recommended way to read a * file, though, since it is possible that the contents might * change between calls to kernel_read_file(). * * Returns number of bytes read (no single read will be bigger * than SSIZE_MAX), or negative on error. * */ ssize_t kernel_read_file(struct file *file, loff_t offset, void **buf, size_t buf_size, size_t *file_size, enum kernel_read_file_id id) { loff_t i_size, pos; ssize_t copied; void *allocated = NULL; bool whole_file; int ret; if (offset != 0 && (!*buf || !file_size)) return -EINVAL; if (!S_ISREG(file_inode(file)->i_mode)) return -EINVAL; ret = deny_write_access(file); if (ret) return ret; i_size = i_size_read(file_inode(file)); if (i_size <= 0) { ret = -EINVAL; goto out; } /* The file is too big for sane activities. */ if (i_size > SSIZE_MAX) { ret = -EFBIG; goto out; } /* The entire file cannot be read in one buffer. */ if (!file_size && offset == 0 && i_size > buf_size) { ret = -EFBIG; goto out; } whole_file = (offset == 0 && i_size <= buf_size); ret = security_kernel_read_file(file, id, whole_file); if (ret) goto out; if (file_size) *file_size = i_size; if (!*buf) *buf = allocated = vmalloc(i_size); if (!*buf) { ret = -ENOMEM; goto out; } pos = offset; copied = 0; while (copied < buf_size) { ssize_t bytes; size_t wanted = min_t(size_t, buf_size - copied, i_size - pos); bytes = kernel_read(file, *buf + copied, wanted, &pos); if (bytes < 0) { ret = bytes; goto out_free; } if (bytes == 0) break; copied += bytes; } if (whole_file) { if (pos != i_size) { ret = -EIO; goto out_free; } ret = security_kernel_post_read_file(file, *buf, i_size, id); } out_free: if (ret < 0) { if (allocated) { vfree(*buf); *buf = NULL; } } out: allow_write_access(file); return ret == 0 ? 
copied : ret; } EXPORT_SYMBOL_GPL(kernel_read_file); ssize_t kernel_read_file_from_path(const char *path, loff_t offset, void **buf, size_t buf_size, size_t *file_size, enum kernel_read_file_id id) { struct file *file; ssize_t ret; if (!path || !*path) return -EINVAL; file = filp_open(path, O_RDONLY, 0); if (IS_ERR(file)) return PTR_ERR(file); ret = kernel_read_file(file, offset, buf, buf_size, file_size, id); fput(file); return ret; } EXPORT_SYMBOL_GPL(kernel_read_file_from_path); ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset, void **buf, size_t buf_size, size_t *file_size, enum kernel_read_file_id id) { struct file *file; struct path root; ssize_t ret; if (!path || !*path) return -EINVAL; task_lock(&init_task); get_fs_root(init_task.fs, &root); task_unlock(&init_task); file = file_open_root(&root, path, O_RDONLY, 0); path_put(&root); if (IS_ERR(file)) return PTR_ERR(file); ret = kernel_read_file(file, offset, buf, buf_size, file_size, id); fput(file); return ret; } EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns); ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf, size_t buf_size, size_t *file_size, enum kernel_read_file_id id) { CLASS(fd, f)(fd); if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ)) return -EBADF; return kernel_read_file(fd_file(f), offset, buf, buf_size, file_size, id); } EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
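As the kernel-doc for kernel_read_file() spells out, the common calling pattern is a whole-file read with *buf == NULL so the helper vmalloc()s the buffer itself and reports the full size through @file_size. A minimal sketch of that pattern follows; the path and the READING_FIRMWARE id are illustrative assumptions, as is the my_load_blob() name.

/*
 * Sketch: whole-file read via kernel_read_file_from_path(), letting the
 * helper allocate the buffer (*buf == NULL). Illustrative only.
 */
#include <linux/kernel_read_file.h>
#include <linux/limits.h>
#include <linux/printk.h>
#include <linux/vmalloc.h>

static int my_load_blob(void)
{
	void *buf = NULL;		/* kernel_read_file() will vmalloc the buffer */
	size_t file_size = 0;
	ssize_t len;

	len = kernel_read_file_from_path("/lib/firmware/my_blob.bin", 0,
					 &buf, INT_MAX, &file_size,
					 READING_FIRMWARE);
	if (len < 0)
		return len;

	pr_info("read %zd of %zu bytes\n", len, file_size);
	/* ... consume buf ... */
	vfree(buf);
	return 0;
}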
// SPDX-License-Identifier: GPL-2.0
/*
 * Common corrected MCE threshold handler code:
 */
#include <linux/interrupt.h>
#include <linux/kernel.h>

#include <asm/irq_vectors.h>
#include <asm/traps.h>
#include <asm/apic.h>
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>

#include "internal.h"

static void default_threshold_interrupt(void)
{
	pr_err("Unexpected threshold interrupt at vector %x\n",
		THRESHOLD_APIC_VECTOR);
}

void (*mce_threshold_vector)(void) = default_threshold_interrupt;

DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
{
	trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
	inc_irq_stat(irq_threshold_count);
	mce_threshold_vector();
	trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
	apic_eoi();
}

DEFINE_PER_CPU(struct mca_storm_desc, storm_desc);

void mce_inherit_storm(unsigned int bank)
{
	struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);

	/*
	 * Previous CPU owning this bank had put it into storm mode,
	 * but the precise history of that storm is unknown. Assume
	 * the worst (all recent polls of the bank found a valid error
	 * logged). This will avoid the new owner prematurely declaring
	 * the storm has ended.
	 */
	storm->banks[bank].history = ~0ull;
	storm->banks[bank].timestamp = jiffies;
}

bool mce_get_storm_mode(void)
{
	return __this_cpu_read(storm_desc.poll_mode);
}

void mce_set_storm_mode(bool storm)
{
	__this_cpu_write(storm_desc.poll_mode, storm);
}

static void mce_handle_storm(unsigned int bank, bool on)
{
	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_handle_storm(bank, on);
		break;
	}
}

void cmci_storm_begin(unsigned int bank)
{
	struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);

	__set_bit(bank, this_cpu_ptr(mce_poll_banks));
	storm->banks[bank].in_storm_mode = true;

	/*
	 * If this is the first bank on this CPU to enter storm mode
	 * start polling.
	 */
	if (++storm->stormy_bank_count == 1)
		mce_timer_kick(true);
}

void cmci_storm_end(unsigned int bank)
{
	struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);

	__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
	storm->banks[bank].history = 0;
	storm->banks[bank].in_storm_mode = false;

	/* If no banks left in storm mode, stop polling. */
	if (!--storm->stormy_bank_count)
		mce_timer_kick(false);
}

void mce_track_storm(struct mce *mce)
{
	struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
	unsigned long now = jiffies, delta;
	unsigned int shift = 1;
	u64 history = 0;

	/* No tracking needed for banks that do not support CMCI */
	if (storm->banks[mce->bank].poll_only)
		return;

	/*
	 * When a bank is in storm mode it is polled once per second and
	 * the history mask will record about the last minute of poll results.
	 * If it is not in storm mode, then the bank is only checked when
	 * there is a CMCI interrupt. Check how long it has been since
	 * this bank was last checked, and adjust the amount of "shift"
	 * to apply to history.
	 */
	if (!storm->banks[mce->bank].in_storm_mode) {
		delta = now - storm->banks[mce->bank].timestamp;
		shift = (delta + HZ) / HZ;
	}

	/* If it has been a long time since the last poll, clear history. */
	if (shift < NUM_HISTORY_BITS)
		history = storm->banks[mce->bank].history << shift;

	storm->banks[mce->bank].timestamp = now;

	/* History keeps track of corrected errors. VAL=1 && UC=0 */
	if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce))
		history |= 1;

	storm->banks[mce->bank].history = history;

	if (storm->banks[mce->bank].in_storm_mode) {
		if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0))
			return;
		printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n",
				smp_processor_id(), mce->bank);
		mce_handle_storm(mce->bank, false);
		cmci_storm_end(mce->bank);
	} else {
		if (hweight64(history) < STORM_BEGIN_THRESHOLD)
			return;
		printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n",
				smp_processor_id(), mce->bank);
		mce_handle_storm(mce->bank, true);
		cmci_storm_begin(mce->bank);
	}
}
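The storm heuristic in mce_track_storm() is a per-bank shift register: each poll shifts history left by the number of elapsed poll intervals, sets bit 0 when a corrected error was seen, and compares the bit population against two thresholds. The stand-alone sketch below mirrors only that arithmetic; the constants are assumed to match what mce/internal.h defines (NUM_HISTORY_BITS 64, STORM_BEGIN_THRESHOLD 5, STORM_END_POLL_THRESHOLD 30), and the timer/CMCI plumbing is omitted.

/*
 * Stand-alone sketch of the history bookkeeping in mce_track_storm().
 * "shift" is how many poll intervals elapsed since the bank was last checked
 * (the kernel derives it from jiffies; in storm mode it is always 1).
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_HISTORY_BITS		64
#define STORM_BEGIN_THRESHOLD		5
#define STORM_END_POLL_THRESHOLD	30

struct bank_state {
	uint64_t history;
	bool in_storm_mode;
};

static void track(struct bank_state *b, unsigned int shift, bool corrected_err)
{
	uint64_t history = 0;

	if (shift < NUM_HISTORY_BITS)
		history = b->history << shift;
	if (corrected_err)
		history |= 1;
	b->history = history;

	if (b->in_storm_mode) {
		/* storm subsides once bits 0..30 (about 30 polls) are all clean */
		if (!(history & (((uint64_t)1 << (STORM_END_POLL_THRESHOLD + 1)) - 1)))
			b->in_storm_mode = false;
	} else {
		/* storm begins once 5 of the recorded polls saw an error */
		if (__builtin_popcountll(history) >= STORM_BEGIN_THRESHOLD)
			b->in_storm_mode = true;
	}
}

int main(void)
{
	struct bank_state b = { 0 };

	for (int i = 0; i < 5; i++)
		track(&b, 1, true);	/* five errors in a row -> storm begins */
	printf("in_storm_mode=%d\n", b.in_storm_mode);

	for (int i = 0; i < 31; i++)
		track(&b, 1, false);	/* ~30 clean polls -> storm subsides */
	printf("in_storm_mode=%d\n", b.in_storm_mode);
	return 0;
}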
914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 // SPDX-License-Identifier: GPL-2.0-only /* * Intel & MS High Precision Event Timer Implementation. * * Copyright (C) 2003 Intel Corporation * Venki Pallipadi * (c) Copyright 2004 Hewlett-Packard Development Company, L.P. * Bob Picco <robert.picco@hp.com> */ #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/miscdevice.h> #include <linux/major.h> #include <linux/ioport.h> #include <linux/fcntl.h> #include <linux/init.h> #include <linux/io-64-nonatomic-lo-hi.h> #include <linux/poll.h> #include <linux/mm.h> #include <linux/proc_fs.h> #include <linux/spinlock.h> #include <linux/sysctl.h> #include <linux/wait.h> #include <linux/sched/signal.h> #include <linux/bcd.h> #include <linux/seq_file.h> #include <linux/bitops.h> #include <linux/compat.h> #include <linux/clocksource.h> #include <linux/uaccess.h> #include <linux/slab.h> #include <linux/io.h> #include <linux/acpi.h> #include <linux/hpet.h> #include <asm/current.h> #include <asm/irq.h> #include <asm/div64.h> /* * The High Precision Event Timer driver. * This driver is closely modelled after the rtc.c driver. * See HPET spec revision 1. */ #define HPET_USER_FREQ (64) #define HPET_DRIFT (500) #define HPET_RANGE_SIZE 1024 /* from HPET spec */ /* WARNING -- don't get confused. These macros are never used * to write the (single) counter, and rarely to read it. * They're badly named; to fix, someday. */ #if BITS_PER_LONG == 64 #define write_counter(V, MC) writeq(V, MC) #define read_counter(MC) readq(MC) #else #define write_counter(V, MC) writel(V, MC) #define read_counter(MC) readl(MC) #endif static DEFINE_MUTEX(hpet_mutex); /* replaces BKL */ static u32 hpet_nhpet, hpet_max_freq = HPET_USER_FREQ; /* A lock for concurrent access by app and isr hpet activity. */ static DEFINE_SPINLOCK(hpet_lock); #define HPET_DEV_NAME (7) struct hpet_dev { struct hpets *hd_hpets; struct hpet __iomem *hd_hpet; struct hpet_timer __iomem *hd_timer; unsigned long hd_ireqfreq; unsigned long hd_irqdata; wait_queue_head_t hd_waitqueue; struct fasync_struct *hd_async_queue; unsigned int hd_flags; unsigned int hd_irq; unsigned int hd_hdwirq; char hd_name[HPET_DEV_NAME]; }; struct hpets { struct hpets *hp_next; struct hpet __iomem *hp_hpet; unsigned long hp_hpet_phys; unsigned long long hp_tick_freq; unsigned long hp_delta; unsigned int hp_ntimer; unsigned int hp_which; struct hpet_dev hp_dev[] __counted_by(hp_ntimer); }; static struct hpets *hpets; #define HPET_OPEN 0x0001 #define HPET_IE 0x0002 /* interrupt enabled */ #define HPET_PERIODIC 0x0004 #define HPET_SHARED_IRQ 0x0008 static irqreturn_t hpet_interrupt(int irq, void *data) { struct hpet_dev *devp; unsigned long isr; devp = data; isr = 1 << (devp - devp->hd_hpets->hp_dev); if ((devp->hd_flags & HPET_SHARED_IRQ) && !(isr & readl(&devp->hd_hpet->hpet_isr))) return IRQ_NONE; spin_lock(&hpet_lock); devp->hd_irqdata++; /* * For non-periodic timers, increment the accumulator. * This has the effect of treating non-periodic like periodic. 
*/ if ((devp->hd_flags & (HPET_IE | HPET_PERIODIC)) == HPET_IE) { unsigned long t, mc, base, k; struct hpet __iomem *hpet = devp->hd_hpet; struct hpets *hpetp = devp->hd_hpets; t = devp->hd_ireqfreq; read_counter(&devp->hd_timer->hpet_compare); mc = read_counter(&hpet->hpet_mc); /* The time for the next interrupt would logically be t + m, * however, if we are very unlucky and the interrupt is delayed * for longer than t then we will completely miss the next * interrupt if we set t + m and an application will hang. * Therefore we need to make a more complex computation assuming * that there exists a k for which the following is true: * k * t + base < mc + delta * (k + 1) * t + base > mc + delta * where t is the interval in hpet ticks for the given freq, * base is the theoretical start value 0 < base < t, * mc is the main counter value at the time of the interrupt, * delta is the time it takes to write the a value to the * comparator. * k may then be computed as (mc - base + delta) / t . */ base = mc % t; k = (mc - base + hpetp->hp_delta) / t; write_counter(t * (k + 1) + base, &devp->hd_timer->hpet_compare); } if (devp->hd_flags & HPET_SHARED_IRQ) writel(isr, &devp->hd_hpet->hpet_isr); spin_unlock(&hpet_lock); wake_up_interruptible(&devp->hd_waitqueue); kill_fasync(&devp->hd_async_queue, SIGIO, POLL_IN); return IRQ_HANDLED; } static void hpet_timer_set_irq(struct hpet_dev *devp) { const unsigned int nr_irqs = irq_get_nr_irqs(); unsigned long v; int irq, gsi; struct hpet_timer __iomem *timer; spin_lock_irq(&hpet_lock); if (devp->hd_hdwirq) { spin_unlock_irq(&hpet_lock); return; } timer = devp->hd_timer; /* we prefer level triggered mode */ v = readl(&timer->hpet_config); if (!(v & Tn_INT_TYPE_CNF_MASK)) { v |= Tn_INT_TYPE_CNF_MASK; writel(v, &timer->hpet_config); } spin_unlock_irq(&hpet_lock); v = (readq(&timer->hpet_config) & Tn_INT_ROUTE_CAP_MASK) >> Tn_INT_ROUTE_CAP_SHIFT; /* * In PIC mode, skip IRQ0-4, IRQ6-9, IRQ12-15 which is always used by * legacy device. In IO APIC mode, we skip all the legacy IRQS. 
*/ if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) v &= ~0xf3df; else v &= ~0xffff; for_each_set_bit(irq, &v, HPET_MAX_IRQ) { if (irq >= nr_irqs) { irq = HPET_MAX_IRQ; break; } gsi = acpi_register_gsi(NULL, irq, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW); if (gsi > 0) break; /* FIXME: Setup interrupt source table */ } if (irq < HPET_MAX_IRQ) { spin_lock_irq(&hpet_lock); v = readl(&timer->hpet_config); v |= irq << Tn_INT_ROUTE_CNF_SHIFT; writel(v, &timer->hpet_config); devp->hd_hdwirq = gsi; spin_unlock_irq(&hpet_lock); } return; } static int hpet_open(struct inode *inode, struct file *file) { struct hpet_dev *devp; struct hpets *hpetp; int i; if (file->f_mode & FMODE_WRITE) return -EINVAL; mutex_lock(&hpet_mutex); spin_lock_irq(&hpet_lock); for (devp = NULL, hpetp = hpets; hpetp && !devp; hpetp = hpetp->hp_next) for (i = 0; i < hpetp->hp_ntimer; i++) if (hpetp->hp_dev[i].hd_flags & HPET_OPEN) { continue; } else { devp = &hpetp->hp_dev[i]; break; } if (!devp) { spin_unlock_irq(&hpet_lock); mutex_unlock(&hpet_mutex); return -EBUSY; } file->private_data = devp; devp->hd_irqdata = 0; devp->hd_flags |= HPET_OPEN; spin_unlock_irq(&hpet_lock); mutex_unlock(&hpet_mutex); hpet_timer_set_irq(devp); return 0; } static ssize_t hpet_read(struct file *file, char __user *buf, size_t count, loff_t * ppos) { DECLARE_WAITQUEUE(wait, current); unsigned long data; ssize_t retval; struct hpet_dev *devp; devp = file->private_data; if (!devp->hd_ireqfreq) return -EIO; if (in_compat_syscall()) { if (count < sizeof(compat_ulong_t)) return -EINVAL; } else { if (count < sizeof(unsigned long)) return -EINVAL; } add_wait_queue(&devp->hd_waitqueue, &wait); for ( ; ; ) { set_current_state(TASK_INTERRUPTIBLE); spin_lock_irq(&hpet_lock); data = devp->hd_irqdata; devp->hd_irqdata = 0; spin_unlock_irq(&hpet_lock); if (data) { break; } else if (file->f_flags & O_NONBLOCK) { retval = -EAGAIN; goto out; } else if (signal_pending(current)) { retval = -ERESTARTSYS; goto out; } schedule(); } if (in_compat_syscall()) { retval = put_user(data, (compat_ulong_t __user *)buf); if (!retval) retval = sizeof(compat_ulong_t); } else { retval = put_user(data, (unsigned long __user *)buf); if (!retval) retval = sizeof(unsigned long); } out: __set_current_state(TASK_RUNNING); remove_wait_queue(&devp->hd_waitqueue, &wait); return retval; } static __poll_t hpet_poll(struct file *file, poll_table * wait) { unsigned long v; struct hpet_dev *devp; devp = file->private_data; if (!devp->hd_ireqfreq) return 0; poll_wait(file, &devp->hd_waitqueue, wait); spin_lock_irq(&hpet_lock); v = devp->hd_irqdata; spin_unlock_irq(&hpet_lock); if (v != 0) return EPOLLIN | EPOLLRDNORM; return 0; } #ifdef CONFIG_HPET_MMAP #ifdef CONFIG_HPET_MMAP_DEFAULT static int hpet_mmap_enabled = 1; #else static int hpet_mmap_enabled = 0; #endif static __init int hpet_mmap_enable(char *str) { get_option(&str, &hpet_mmap_enabled); pr_info("HPET mmap %s\n", hpet_mmap_enabled ? 
"enabled" : "disabled"); return 1; } __setup("hpet_mmap=", hpet_mmap_enable); static int hpet_mmap(struct file *file, struct vm_area_struct *vma) { struct hpet_dev *devp; unsigned long addr; if (!hpet_mmap_enabled) return -EACCES; devp = file->private_data; addr = devp->hd_hpets->hp_hpet_phys; if (addr & (PAGE_SIZE - 1)) return -ENOSYS; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); return vm_iomap_memory(vma, addr, PAGE_SIZE); } #else static int hpet_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } #endif static int hpet_fasync(int fd, struct file *file, int on) { struct hpet_dev *devp; devp = file->private_data; if (fasync_helper(fd, file, on, &devp->hd_async_queue) >= 0) return 0; else return -EIO; } static int hpet_release(struct inode *inode, struct file *file) { struct hpet_dev *devp; struct hpet_timer __iomem *timer; int irq = 0; devp = file->private_data; timer = devp->hd_timer; spin_lock_irq(&hpet_lock); writeq((readq(&timer->hpet_config) & ~Tn_INT_ENB_CNF_MASK), &timer->hpet_config); irq = devp->hd_irq; devp->hd_irq = 0; devp->hd_ireqfreq = 0; if (devp->hd_flags & HPET_PERIODIC && readq(&timer->hpet_config) & Tn_TYPE_CNF_MASK) { unsigned long v; v = readq(&timer->hpet_config); v ^= Tn_TYPE_CNF_MASK; writeq(v, &timer->hpet_config); } devp->hd_flags &= ~(HPET_OPEN | HPET_IE | HPET_PERIODIC); spin_unlock_irq(&hpet_lock); if (irq) free_irq(irq, devp); file->private_data = NULL; return 0; } static int hpet_ioctl_ieon(struct hpet_dev *devp) { struct hpet_timer __iomem *timer; struct hpet __iomem *hpet; struct hpets *hpetp; int irq; unsigned long g, v, t, m; unsigned long flags, isr; timer = devp->hd_timer; hpet = devp->hd_hpet; hpetp = devp->hd_hpets; if (!devp->hd_ireqfreq) return -EIO; spin_lock_irq(&hpet_lock); if (devp->hd_flags & HPET_IE) { spin_unlock_irq(&hpet_lock); return -EBUSY; } devp->hd_flags |= HPET_IE; if (readl(&timer->hpet_config) & Tn_INT_TYPE_CNF_MASK) devp->hd_flags |= HPET_SHARED_IRQ; spin_unlock_irq(&hpet_lock); irq = devp->hd_hdwirq; if (irq) { unsigned long irq_flags; if (devp->hd_flags & HPET_SHARED_IRQ) { /* * To prevent the interrupt handler from seeing an * unwanted interrupt status bit, program the timer * so that it will not fire in the near future ... */ writel(readl(&timer->hpet_config) & ~Tn_TYPE_CNF_MASK, &timer->hpet_config); write_counter(read_counter(&hpet->hpet_mc), &timer->hpet_compare); /* ... and clear any left-over status. */ isr = 1 << (devp - devp->hd_hpets->hp_dev); writel(isr, &hpet->hpet_isr); } sprintf(devp->hd_name, "hpet%d", (int)(devp - hpetp->hp_dev)); irq_flags = devp->hd_flags & HPET_SHARED_IRQ ? IRQF_SHARED : 0; if (request_irq(irq, hpet_interrupt, irq_flags, devp->hd_name, (void *)devp)) { printk(KERN_ERR "hpet: IRQ %d is not free\n", irq); irq = 0; } } if (irq == 0) { spin_lock_irq(&hpet_lock); devp->hd_flags ^= HPET_IE; spin_unlock_irq(&hpet_lock); return -EIO; } devp->hd_irq = irq; t = devp->hd_ireqfreq; v = readq(&timer->hpet_config); /* 64-bit comparators are not yet supported through the ioctls, * so force this into 32-bit mode if it supports both modes */ g = v | Tn_32MODE_CNF_MASK | Tn_INT_ENB_CNF_MASK; if (devp->hd_flags & HPET_PERIODIC) { g |= Tn_TYPE_CNF_MASK; v |= Tn_TYPE_CNF_MASK | Tn_VAL_SET_CNF_MASK; writeq(v, &timer->hpet_config); local_irq_save(flags); /* * NOTE: First we modify the hidden accumulator * register supported by periodic-capable comparators. * We never want to modify the (single) counter; that * would affect all the comparators. 
The value written * is the counter value when the first interrupt is due. */ m = read_counter(&hpet->hpet_mc); write_counter(t + m + hpetp->hp_delta, &timer->hpet_compare); /* * Then we modify the comparator, indicating the period * for subsequent interrupt. */ write_counter(t, &timer->hpet_compare); } else { local_irq_save(flags); m = read_counter(&hpet->hpet_mc); write_counter(t + m + hpetp->hp_delta, &timer->hpet_compare); } if (devp->hd_flags & HPET_SHARED_IRQ) { isr = 1 << (devp - devp->hd_hpets->hp_dev); writel(isr, &hpet->hpet_isr); } writeq(g, &timer->hpet_config); local_irq_restore(flags); return 0; } /* converts Hz to number of timer ticks */ static inline unsigned long hpet_time_div(struct hpets *hpets, unsigned long dis) { unsigned long long m; m = hpets->hp_tick_freq + (dis >> 1); return div64_ul(m, dis); } static int hpet_ioctl_common(struct hpet_dev *devp, unsigned int cmd, unsigned long arg, struct hpet_info *info) { struct hpet_timer __iomem *timer; struct hpets *hpetp; int err; unsigned long v; switch (cmd) { case HPET_IE_OFF: case HPET_INFO: case HPET_EPI: case HPET_DPI: case HPET_IRQFREQ: timer = devp->hd_timer; hpetp = devp->hd_hpets; break; case HPET_IE_ON: return hpet_ioctl_ieon(devp); default: return -EINVAL; } err = 0; switch (cmd) { case HPET_IE_OFF: if ((devp->hd_flags & HPET_IE) == 0) break; v = readq(&timer->hpet_config); v &= ~Tn_INT_ENB_CNF_MASK; writeq(v, &timer->hpet_config); if (devp->hd_irq) { free_irq(devp->hd_irq, devp); devp->hd_irq = 0; } devp->hd_flags ^= HPET_IE; break; case HPET_INFO: { memset(info, 0, sizeof(*info)); if (devp->hd_ireqfreq) info->hi_ireqfreq = hpet_time_div(hpetp, devp->hd_ireqfreq); info->hi_flags = readq(&timer->hpet_config) & Tn_PER_INT_CAP_MASK; info->hi_hpet = hpetp->hp_which; info->hi_timer = devp - hpetp->hp_dev; break; } case HPET_EPI: v = readq(&timer->hpet_config); if ((v & Tn_PER_INT_CAP_MASK) == 0) { err = -ENXIO; break; } devp->hd_flags |= HPET_PERIODIC; break; case HPET_DPI: v = readq(&timer->hpet_config); if ((v & Tn_PER_INT_CAP_MASK) == 0) { err = -ENXIO; break; } if (devp->hd_flags & HPET_PERIODIC && readq(&timer->hpet_config) & Tn_TYPE_CNF_MASK) { v = readq(&timer->hpet_config); v ^= Tn_TYPE_CNF_MASK; writeq(v, &timer->hpet_config); } devp->hd_flags &= ~HPET_PERIODIC; break; case HPET_IRQFREQ: if ((arg > hpet_max_freq) && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; break; } if (!arg) { err = -EINVAL; break; } devp->hd_ireqfreq = hpet_time_div(hpetp, arg); } return err; } static long hpet_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct hpet_info info; int err; mutex_lock(&hpet_mutex); err = hpet_ioctl_common(file->private_data, cmd, arg, &info); mutex_unlock(&hpet_mutex); if ((cmd == HPET_INFO) && !err && (copy_to_user((void __user *)arg, &info, sizeof(info)))) err = -EFAULT; return err; } #ifdef CONFIG_COMPAT struct compat_hpet_info { compat_ulong_t hi_ireqfreq; /* Hz */ compat_ulong_t hi_flags; /* information */ unsigned short hi_hpet; unsigned short hi_timer; }; /* 32-bit types would lead to different command codes which should be * translated into 64-bit ones before passed to hpet_ioctl_common */ #define COMPAT_HPET_INFO _IOR('h', 0x03, struct compat_hpet_info) #define COMPAT_HPET_IRQFREQ _IOW('h', 0x6, compat_ulong_t) static long hpet_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct hpet_info info; int err; if (cmd == COMPAT_HPET_INFO) cmd = HPET_INFO; if (cmd == COMPAT_HPET_IRQFREQ) cmd = HPET_IRQFREQ; mutex_lock(&hpet_mutex); err = 
hpet_ioctl_common(file->private_data, cmd, arg, &info); mutex_unlock(&hpet_mutex); if ((cmd == HPET_INFO) && !err) { struct compat_hpet_info __user *u = compat_ptr(arg); if (put_user(info.hi_ireqfreq, &u->hi_ireqfreq) || put_user(info.hi_flags, &u->hi_flags) || put_user(info.hi_hpet, &u->hi_hpet) || put_user(info.hi_timer, &u->hi_timer)) err = -EFAULT; } return err; } #endif static const struct file_operations hpet_fops = { .owner = THIS_MODULE, .read = hpet_read, .poll = hpet_poll, .unlocked_ioctl = hpet_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = hpet_compat_ioctl, #endif .open = hpet_open, .release = hpet_release, .fasync = hpet_fasync, .mmap = hpet_mmap, }; static int hpet_is_known(struct hpet_data *hdp) { struct hpets *hpetp; for (hpetp = hpets; hpetp; hpetp = hpetp->hp_next) if (hpetp->hp_hpet_phys == hdp->hd_phys_address) return 1; return 0; } static const struct ctl_table hpet_table[] = { { .procname = "max-user-freq", .data = &hpet_max_freq, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static struct ctl_table_header *sysctl_header; /* * Adjustment for when arming the timer with * initial conditions. That is, main counter * ticks expired before interrupts are enabled. */ #define TICK_CALIBRATE (1000UL) static unsigned long __hpet_calibrate(struct hpets *hpetp) { struct hpet_timer __iomem *timer = NULL; unsigned long t, m, count, i, flags, start; struct hpet_dev *devp; int j; struct hpet __iomem *hpet; for (j = 0, devp = hpetp->hp_dev; j < hpetp->hp_ntimer; j++, devp++) if ((devp->hd_flags & HPET_OPEN) == 0) { timer = devp->hd_timer; break; } if (!timer) return 0; hpet = hpetp->hp_hpet; t = read_counter(&timer->hpet_compare); i = 0; count = hpet_time_div(hpetp, TICK_CALIBRATE); local_irq_save(flags); start = read_counter(&hpet->hpet_mc); do { m = read_counter(&hpet->hpet_mc); write_counter(t + m + hpetp->hp_delta, &timer->hpet_compare); } while (i++, (m - start) < count); local_irq_restore(flags); return (m - start) / i; } static unsigned long hpet_calibrate(struct hpets *hpetp) { unsigned long ret = ~0UL; unsigned long tmp; /* * Try to calibrate until return value becomes stable small value. * If SMI interruption occurs in calibration loop, the return value * will be big. This avoids its impact. */ for ( ; ; ) { tmp = __hpet_calibrate(hpetp); if (ret <= tmp) break; ret = tmp; } return ret; } int hpet_alloc(struct hpet_data *hdp) { u64 cap, mcfg; struct hpet_dev *devp; u32 i, ntimer; struct hpets *hpetp; struct hpet __iomem *hpet; static struct hpets *last; u32 period; unsigned long long temp; u32 remainder; /* * hpet_alloc can be called by platform dependent code. * If platform dependent code has allocated the hpet that * ACPI has also reported, then we catch it here. 
*/ if (hpet_is_known(hdp)) { printk(KERN_DEBUG "%s: duplicate HPET ignored\n", __func__); return 0; } hpetp = kzalloc(struct_size(hpetp, hp_dev, hdp->hd_nirqs), GFP_KERNEL); if (!hpetp) return -ENOMEM; hpetp->hp_which = hpet_nhpet++; hpetp->hp_hpet = hdp->hd_address; hpetp->hp_hpet_phys = hdp->hd_phys_address; hpetp->hp_ntimer = hdp->hd_nirqs; for (i = 0; i < hdp->hd_nirqs; i++) hpetp->hp_dev[i].hd_hdwirq = hdp->hd_irq[i]; hpet = hpetp->hp_hpet; cap = readq(&hpet->hpet_cap); ntimer = ((cap & HPET_NUM_TIM_CAP_MASK) >> HPET_NUM_TIM_CAP_SHIFT) + 1; if (hpetp->hp_ntimer != ntimer) { printk(KERN_WARNING "hpet: number irqs doesn't agree" " with number of timers\n"); kfree(hpetp); return -ENODEV; } if (last) last->hp_next = hpetp; else hpets = hpetp; last = hpetp; period = (cap & HPET_COUNTER_CLK_PERIOD_MASK) >> HPET_COUNTER_CLK_PERIOD_SHIFT; /* fs, 10^-15 */ temp = 1000000000000000uLL; /* 10^15 femtoseconds per second */ temp += period >> 1; /* round */ do_div(temp, period); hpetp->hp_tick_freq = temp; /* ticks per second */ printk(KERN_INFO "hpet%u: at MMIO 0x%lx, IRQ%s", hpetp->hp_which, hdp->hd_phys_address, str_plural(hpetp->hp_ntimer)); for (i = 0; i < hpetp->hp_ntimer; i++) printk(KERN_CONT "%s %u", i > 0 ? "," : "", hdp->hd_irq[i]); printk(KERN_CONT "\n"); temp = hpetp->hp_tick_freq; remainder = do_div(temp, 1000000); printk(KERN_INFO "hpet%u: %u comparators, %d-bit %u.%06u MHz counter\n", hpetp->hp_which, hpetp->hp_ntimer, cap & HPET_COUNTER_SIZE_MASK ? 64 : 32, (unsigned) temp, remainder); mcfg = readq(&hpet->hpet_config); if ((mcfg & HPET_ENABLE_CNF_MASK) == 0) { write_counter(0L, &hpet->hpet_mc); mcfg |= HPET_ENABLE_CNF_MASK; writeq(mcfg, &hpet->hpet_config); } for (i = 0, devp = hpetp->hp_dev; i < hpetp->hp_ntimer; i++, devp++) { struct hpet_timer __iomem *timer; timer = &hpet->hpet_timers[devp - hpetp->hp_dev]; devp->hd_hpets = hpetp; devp->hd_hpet = hpet; devp->hd_timer = timer; /* * If the timer was reserved by platform code, * then make timer unavailable for opens. 
*/ if (hdp->hd_state & (1 << i)) { devp->hd_flags = HPET_OPEN; continue; } init_waitqueue_head(&devp->hd_waitqueue); } hpetp->hp_delta = hpet_calibrate(hpetp); return 0; } static acpi_status hpet_resources(struct acpi_resource *res, void *data) { struct hpet_data *hdp; acpi_status status; struct acpi_resource_address64 addr; hdp = data; status = acpi_resource_to_address64(res, &addr); if (ACPI_SUCCESS(status)) { hdp->hd_phys_address = addr.address.minimum; hdp->hd_address = ioremap(addr.address.minimum, addr.address.address_length); if (!hdp->hd_address) return AE_ERROR; if (hpet_is_known(hdp)) { iounmap(hdp->hd_address); return AE_ALREADY_EXISTS; } } else if (res->type == ACPI_RESOURCE_TYPE_FIXED_MEMORY32) { struct acpi_resource_fixed_memory32 *fixmem32; fixmem32 = &res->data.fixed_memory32; hdp->hd_phys_address = fixmem32->address; hdp->hd_address = ioremap(fixmem32->address, HPET_RANGE_SIZE); if (!hdp->hd_address) return AE_ERROR; if (hpet_is_known(hdp)) { iounmap(hdp->hd_address); return AE_ALREADY_EXISTS; } } else if (res->type == ACPI_RESOURCE_TYPE_EXTENDED_IRQ) { struct acpi_resource_extended_irq *irqp; int i, irq; irqp = &res->data.extended_irq; for (i = 0; i < irqp->interrupt_count; i++) { if (hdp->hd_nirqs >= HPET_MAX_TIMERS) break; irq = acpi_register_gsi(NULL, irqp->interrupts[i], irqp->triggering, irqp->polarity); if (irq < 0) return AE_ERROR; hdp->hd_irq[hdp->hd_nirqs] = irq; hdp->hd_nirqs++; } } return AE_OK; } static int hpet_acpi_add(struct acpi_device *device) { acpi_status result; struct hpet_data data; memset(&data, 0, sizeof(data)); result = acpi_walk_resources(device->handle, METHOD_NAME__CRS, hpet_resources, &data); if (ACPI_FAILURE(result)) return -ENODEV; if (!data.hd_address || !data.hd_nirqs) { if (data.hd_address) iounmap(data.hd_address); printk("%s: no address or irqs in _CRS\n", __func__); return -ENODEV; } return hpet_alloc(&data); } static const struct acpi_device_id hpet_device_ids[] = { {"PNP0103", 0}, {"", 0}, }; static struct acpi_driver hpet_acpi_driver = { .name = "hpet", .ids = hpet_device_ids, .ops = { .add = hpet_acpi_add, }, }; static struct miscdevice hpet_misc = { HPET_MINOR, "hpet", &hpet_fops }; static int __init hpet_init(void) { int result; result = misc_register(&hpet_misc); if (result < 0) return -ENODEV; sysctl_header = register_sysctl("dev/hpet", hpet_table); result = acpi_bus_register_driver(&hpet_acpi_driver); if (result < 0) { unregister_sysctl_table(sysctl_header); misc_deregister(&hpet_misc); return result; } return 0; } device_initcall(hpet_init); /* MODULE_AUTHOR("Bob Picco <Robert.Picco@hp.com>"); MODULE_LICENSE("GPL"); */
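The character-device half above (hpet_open(), hpet_ioctl(), hpet_read()) is driven from user space through /dev/hpet. The sketch below shows the usual sequence, assuming a timer with a free routable IRQ is available; the 10 Hz value is an arbitrary example and must not exceed /proc/sys/dev/hpet/max-user-freq unless the caller has CAP_SYS_RESOURCE.

/*
 * Sketch of user-space use of /dev/hpet: open read-only (hpet_open() rejects
 * FMODE_WRITE), set the interrupt frequency, enable interrupts, then read()
 * the count of interrupts received since the previous read.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/hpet.h>

int main(void)
{
	unsigned long data;
	int fd = open("/dev/hpet", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/hpet");
		return 1;
	}
	if (ioctl(fd, HPET_IRQFREQ, 10UL) < 0 ||	/* 10 interrupts per second */
	    ioctl(fd, HPET_IE_ON, 0) < 0) {
		perror("ioctl");
		close(fd);
		return 1;
	}
	for (int i = 0; i < 5; i++) {
		/* blocks until at least one interrupt arrived (see hpet_read()) */
		if (read(fd, &data, sizeof(data)) == sizeof(data))
			printf("interrupts since last read: %lu\n", data);
	}
	ioctl(fd, HPET_IE_OFF, 0);
	close(fd);
	return 0;
}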
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Digital Audio (PCM) abstract layer
 *  Copyright (c) by Jaroslav Kysela <perex@perex.cz>
 */

#include <linux/time.h>
#include <linux/gcd.h>
#include <sound/core.h>
#include <sound/pcm.h>
#include <sound/timer.h>

#include "pcm_local.h"

/*
 *  Timer functions
 */

void snd_pcm_timer_resolution_change(struct snd_pcm_substream *substream)
{
	unsigned long rate, mult, fsize, l, post;
	struct snd_pcm_runtime *runtime = substream->runtime;

	mult = 1000000000;
	rate = runtime->rate;
	if (snd_BUG_ON(!rate))
		return;
	l = gcd(mult, rate);
	mult /= l;
	rate /= l;
	fsize = runtime->period_size;
	if (snd_BUG_ON(!fsize))
		return;
	l = gcd(rate, fsize);
	rate /= l;
	fsize /= l;
	post = 1;
	while ((mult * fsize) / fsize != mult) {
		mult /= 2;
		post *= 2;
	}
	if (rate == 0) {
		pcm_err(substream->pcm,
			"pcm timer resolution out of range (rate = %u, period_size = %lu)\n",
			runtime->rate, runtime->period_size);
		runtime->timer_resolution = -1;
		return;
	}
	runtime->timer_resolution = (mult * fsize / rate) * post;
}

static unsigned long snd_pcm_timer_resolution(struct snd_timer *timer)
{
	struct snd_pcm_substream *substream;

	substream = timer->private_data;
	return substream->runtime ? substream->runtime->timer_resolution : 0;
}

static int snd_pcm_timer_start(struct snd_timer *timer)
{
	struct snd_pcm_substream *substream;

	substream = snd_timer_chip(timer);
	substream->timer_running = 1;
	return 0;
}

static int snd_pcm_timer_stop(struct snd_timer *timer)
{
	struct snd_pcm_substream *substream;

	substream = snd_timer_chip(timer);
	substream->timer_running = 0;
	return 0;
}

static const struct snd_timer_hardware snd_pcm_timer =
{
	.flags =	SNDRV_TIMER_HW_AUTO | SNDRV_TIMER_HW_SLAVE,
	.resolution =	0,
	.ticks =	1,
	.c_resolution =	snd_pcm_timer_resolution,
	.start =	snd_pcm_timer_start,
	.stop =		snd_pcm_timer_stop,
};

/*
 *  Init functions
 */

static void snd_pcm_timer_free(struct snd_timer *timer)
{
	struct snd_pcm_substream *substream = timer->private_data;

	substream->timer = NULL;
}

void snd_pcm_timer_init(struct snd_pcm_substream *substream)
{
	struct snd_timer_id tid;
	struct snd_timer *timer;

	tid.dev_sclass = SNDRV_TIMER_SCLASS_NONE;
	tid.dev_class = SNDRV_TIMER_CLASS_PCM;
	tid.card = substream->pcm->card->number;
	tid.device = substream->pcm->device;
	tid.subdevice = (substream->number << 1) | (substream->stream & 1);
	if (snd_timer_new(substream->pcm->card, "PCM", &tid, &timer) < 0)
		return;
	sprintf(timer->name, "PCM %s %i-%i-%i",
		snd_pcm_direction_name(substream->stream),
		tid.card, tid.device, tid.subdevice);
	timer->hw = snd_pcm_timer;
	if (snd_device_register(timer->card, timer) < 0) {
		snd_device_free(timer->card, timer);
		return;
	}
	timer->private_data = substream;
	timer->private_free = snd_pcm_timer_free;
	substream->timer = timer;
}

void snd_pcm_timer_done(struct snd_pcm_substream *substream)
{
	if (substream->timer) {
		snd_device_free(substream->pcm->card, substream->timer);
		substream->timer = NULL;
	}
}
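snd_pcm_timer_resolution_change() computes the PCM tick length in nanoseconds as period_size * 10^9 / rate, reducing by gcd first and halving the multiplier (while scaling `post` back up) only if the product would overflow an unsigned long. A stand-alone sketch of that arithmetic with one worked value (48 kHz, 1024-frame period gives 21333333 ns) follows; it is an illustration, not kernel code.

/*
 * Sketch of the resolution math in snd_pcm_timer_resolution_change():
 * resolution_ns ~= period_size * 1e9 / rate, with gcd reduction and an
 * overflow escape hatch. Assumes valid (non-zero) inputs.
 */
#include <stdio.h>

static unsigned long gcd_ul(unsigned long a, unsigned long b)
{
	while (b) {
		unsigned long t = a % b;
		a = b;
		b = t;
	}
	return a;
}

static unsigned long pcm_timer_resolution_ns(unsigned long rate, unsigned long period_size)
{
	unsigned long mult = 1000000000UL, post = 1, l;

	l = gcd_ul(mult, rate);
	mult /= l;
	rate /= l;
	l = gcd_ul(rate, period_size);
	rate /= l;
	period_size /= l;
	/* halve mult (remembering the factor in post) until mult * period_size fits */
	while ((mult * period_size) / period_size != mult) {
		mult /= 2;
		post *= 2;
	}
	if (rate == 0)
		return 0;	/* out of range; the driver flags this as -1 */
	return (mult * period_size / rate) * post;
}

int main(void)
{
	/* 1024 frames at 48000 Hz -> 21333333 ns (about 21.3 ms) per period */
	printf("%lu\n", pcm_timer_resolution_ns(48000, 1024));
	return 0;
}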
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * inet6 interface/address list definitions * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #ifndef _NET_IF_INET6_H #define _NET_IF_INET6_H #include <net/snmp.h> #include <linux/ipv6.h> #include <linux/refcount.h> /* inet6_dev.if_flags */ #define IF_RA_OTHERCONF 0x80 #define IF_RA_MANAGED 0x40 #define IF_RA_RCVD 0x20 #define IF_RS_SENT 0x10 #define IF_READY 0x80000000 enum { INET6_IFADDR_STATE_PREDAD, INET6_IFADDR_STATE_DAD, INET6_IFADDR_STATE_POSTDAD, INET6_IFADDR_STATE_ERRDAD, INET6_IFADDR_STATE_DEAD, }; struct inet6_ifaddr { struct in6_addr addr; __u32 prefix_len; __u32 rt_priority; /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */ __u32 valid_lft; __u32 prefered_lft; refcount_t refcnt; spinlock_t lock; int state; __u32 flags; __u8 dad_probes; __u8 stable_privacy_retry; __u16 scope; __u64 dad_nonce; unsigned long cstamp; /* created timestamp */ unsigned long tstamp; /* updated timestamp */ struct delayed_work dad_work; struct inet6_dev *idev; struct fib6_info *rt; struct hlist_node addr_lst; struct list_head if_list; /* * Used to safely traverse idev->addr_list in process context * if the idev->lock needed to protect idev->addr_list cannot be held. * In that case, add the items to this list temporarily and iterate * without holding idev->lock. * See addrconf_ifdown and dev_forward_change. */ struct list_head if_list_aux; struct list_head tmp_list; struct inet6_ifaddr *ifpub; int regen_count; bool tokenized; u8 ifa_proto; struct rcu_head rcu; struct in6_addr peer_addr; }; struct ip6_sf_socklist { unsigned int sl_max; unsigned int sl_count; struct rcu_head rcu; struct in6_addr sl_addr[] __counted_by(sl_max); }; #define IP6_SFBLOCK 10 /* allocate this many at once */ struct ipv6_mc_socklist { struct in6_addr addr; int ifindex; unsigned int sfmode; /* MCAST_{INCLUDE,EXCLUDE} */ struct ipv6_mc_socklist __rcu *next; struct ip6_sf_socklist __rcu *sflist; struct rcu_head rcu; }; struct ip6_sf_list { struct ip6_sf_list __rcu *sf_next; struct in6_addr sf_addr; unsigned long sf_count[2]; /* include/exclude counts */ unsigned char sf_gsresp; /* include in g & s response? */ unsigned char sf_oldin; /* change state */ unsigned char sf_crcount; /* retrans.
left to send */ struct rcu_head rcu; }; #define MAF_TIMER_RUNNING 0x01 #define MAF_LAST_REPORTER 0x02 #define MAF_LOADED 0x04 #define MAF_NOREPORT 0x08 #define MAF_GSQUERY 0x10 struct ifmcaddr6 { struct in6_addr mca_addr; struct inet6_dev *idev; struct ifmcaddr6 __rcu *next; struct ip6_sf_list __rcu *mca_sources; struct ip6_sf_list __rcu *mca_tomb; unsigned int mca_sfmode; unsigned char mca_crcount; unsigned long mca_sfcount[2]; struct delayed_work mca_work; unsigned int mca_flags; int mca_users; refcount_t mca_refcnt; unsigned long mca_cstamp; unsigned long mca_tstamp; struct rcu_head rcu; }; /* Anycast stuff */ struct ipv6_ac_socklist { struct in6_addr acl_addr; int acl_ifindex; struct ipv6_ac_socklist *acl_next; }; struct ifacaddr6 { struct in6_addr aca_addr; struct fib6_info *aca_rt; struct ifacaddr6 __rcu *aca_next; struct hlist_node aca_addr_lst; int aca_users; refcount_t aca_refcnt; unsigned long aca_cstamp; unsigned long aca_tstamp; struct rcu_head rcu; }; #define IFA_HOST IPV6_ADDR_LOOPBACK #define IFA_LINK IPV6_ADDR_LINKLOCAL #define IFA_SITE IPV6_ADDR_SITELOCAL struct ipv6_devstat { struct proc_dir_entry *proc_dir_entry; DEFINE_SNMP_STAT(struct ipstats_mib, ipv6); DEFINE_SNMP_STAT_ATOMIC(struct icmpv6_mib_device, icmpv6dev); DEFINE_SNMP_STAT_ATOMIC(struct icmpv6msg_mib_device, icmpv6msgdev); }; struct inet6_dev { struct net_device *dev; netdevice_tracker dev_tracker; struct list_head addr_list; struct ifmcaddr6 __rcu *mc_list; struct ifmcaddr6 __rcu *mc_tomb; unsigned char mc_qrv; /* Query Robustness Variable */ unsigned char mc_gq_running; unsigned char mc_ifc_count; unsigned char mc_dad_count; unsigned long mc_v1_seen; /* Max time we stay in MLDv1 mode */ unsigned long mc_qi; /* Query Interval */ unsigned long mc_qri; /* Query Response Interval */ unsigned long mc_maxdelay; struct delayed_work mc_gq_work; /* general query work */ struct delayed_work mc_ifc_work; /* interface change work */ struct delayed_work mc_dad_work; /* dad complete mc work */ struct delayed_work mc_query_work; /* mld query work */ struct delayed_work mc_report_work; /* mld report work */ struct sk_buff_head mc_query_queue; /* mld query queue */ struct sk_buff_head mc_report_queue; /* mld report queue */ spinlock_t mc_query_lock; /* mld query queue lock */ spinlock_t mc_report_lock; /* mld query report lock */ struct mutex mc_lock; /* mld global lock */ struct ifacaddr6 __rcu *ac_list; rwlock_t lock; refcount_t refcnt; __u32 if_flags; int dead; u32 desync_factor; struct list_head tempaddr_list; struct in6_addr token; struct neigh_parms *nd_parms; struct ipv6_devconf cnf; struct ipv6_devstat stats; struct timer_list rs_timer; __s32 rs_interval; /* in jiffies */ __u8 rs_probes; unsigned long tstamp; /* ipv6InterfaceTable update timestamp */ struct rcu_head rcu; unsigned int ra_mtu; }; static inline void ipv6_eth_mc_map(const struct in6_addr *addr, char *buf) { /* * +-------+-------+-------+-------+-------+-------+ * | 33 | 33 | DST13 | DST14 | DST15 | DST16 | * +-------+-------+-------+-------+-------+-------+ */ buf[0]= 0x33; buf[1]= 0x33; memcpy(buf + 2, &addr->s6_addr32[3], sizeof(__u32)); } static inline void ipv6_arcnet_mc_map(const struct in6_addr *addr, char *buf) { buf[0] = 0x00; } static inline void ipv6_ib_mc_map(const struct in6_addr *addr, const unsigned char *broadcast, char *buf) { unsigned char scope = broadcast[5] & 0xF; buf[0] = 0; /* Reserved */ buf[1] = 0xff; /* Multicast QPN */ buf[2] = 0xff; buf[3] = 0xff; buf[4] = 0xff; buf[5] = 0x10 | scope; /* scope from broadcast address */ buf[6] = 
0x60; /* IPv6 signature */ buf[7] = 0x1b; buf[8] = broadcast[8]; /* P_Key */ buf[9] = broadcast[9]; memcpy(buf + 10, addr->s6_addr + 6, 10); } static inline int ipv6_ipgre_mc_map(const struct in6_addr *addr, const unsigned char *broadcast, char *buf) { if ((broadcast[0] | broadcast[1] | broadcast[2] | broadcast[3]) != 0) { memcpy(buf, broadcast, 4); } else { /* v4mapped? */ if ((addr->s6_addr32[0] | addr->s6_addr32[1] | (addr->s6_addr32[2] ^ htonl(0x0000ffff))) != 0) return -EINVAL; memcpy(buf, &addr->s6_addr32[3], 4); } return 0; } #endif
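As a concrete illustration of ipv6_eth_mc_map() above, the user-space sketch below applies the same "33:33 plus the low 32 bits of the group address" rule to the all-nodes multicast address ff02::1; the expected Ethernet destination is 33:33:00:00:00:01. This only demonstrates the mapping rule and is not kernel code.

/* Sketch of the IPv6-to-Ethernet multicast mapping performed by
 * ipv6_eth_mc_map(): the MAC is 33:33 followed by the last four bytes of
 * the IPv6 group address. Example group: ff02::1 (all nodes). */
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        struct in6_addr group;
        unsigned char mac[6] = { 0x33, 0x33 };

        inet_pton(AF_INET6, "ff02::1", &group);
        memcpy(mac + 2, &group.s6_addr[12], 4); /* low 32 bits of the address */
        printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
               mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
        return 0;
}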
// SPDX-License-Identifier: GPL-2.0-only /* * vMTRR implementation * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * Copyright(C) 2015 Intel Corporation. * * Authors: * Yaniv Kamay <yaniv@qumranet.com> * Avi Kivity <avi@qumranet.com> * Marcelo Tosatti <mtosatti@redhat.com> * Paolo Bonzini <pbonzini@redhat.com> * Xiao Guangrong <guangrong.xiao@linux.intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <asm/mtrr.h> #include "cpuid.h" #include "x86.h" static u64 *find_mtrr(struct kvm_vcpu *vcpu, unsigned int msr) { int index; switch (msr) { case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1): index = msr - MTRRphysBase_MSR(0); return &vcpu->arch.mtrr_state.var[index]; case MSR_MTRRfix64K_00000: return &vcpu->arch.mtrr_state.fixed_64k; case MSR_MTRRfix16K_80000: case MSR_MTRRfix16K_A0000: index = msr - MSR_MTRRfix16K_80000; return &vcpu->arch.mtrr_state.fixed_16k[index]; case MSR_MTRRfix4K_C0000: case MSR_MTRRfix4K_C8000: case MSR_MTRRfix4K_D0000: case MSR_MTRRfix4K_D8000: case MSR_MTRRfix4K_E0000: case MSR_MTRRfix4K_E8000: case MSR_MTRRfix4K_F0000: case MSR_MTRRfix4K_F8000: index = msr - MSR_MTRRfix4K_C0000; return &vcpu->arch.mtrr_state.fixed_4k[index]; case MSR_MTRRdefType: return &vcpu->arch.mtrr_state.deftype; default: break; } return NULL; } static bool valid_mtrr_type(unsigned t) { return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ } static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) { int i; u64 mask; if (msr == MSR_MTRRdefType) { if (data & ~0xcff) return false; return valid_mtrr_type(data & 0xff); } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { for (i = 0; i < 8 ; i++) if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) return false; return true; } /* variable MTRRs */ if (WARN_ON_ONCE(!(msr >= MTRRphysBase_MSR(0) && msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1)))) return false; mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu); if ((msr & 1) == 0) { /* MTRR base */ if (!valid_mtrr_type(data & 0xff)) return false; mask |= 0xf00; } else { /* MTRR mask */ mask |= 0x7ff; } return (data & mask) == 0; } int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { u64 *mtrr; mtrr = find_mtrr(vcpu, msr); if (!mtrr) return 1; if (!kvm_mtrr_valid(vcpu, msr, data)) return 1; *mtrr = data; return 0; } int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 *mtrr; /* MSR_MTRRcap is a readonly MSR. */ if (msr == MSR_MTRRcap) { /* * SMRR = 0 * WC = 1 * FIX = 1 * VCNT = KVM_NR_VAR_MTRR */ *pdata = 0x500 | KVM_NR_VAR_MTRR; return 0; } mtrr = find_mtrr(vcpu, msr); if (!mtrr) return 1; *pdata = *mtrr; return 0; }
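The 0x73 mask in valid_mtrr_type() encodes exactly the architecturally defined memory types 0 (UC), 1 (WC), 4 (WT), 5 (WP) and 6 (WB). The small stand-alone sketch below re-implements the check and prints which of the eight possible encodings it accepts; it is illustrative only.

/* Sketch of the valid_mtrr_type() check: a type is valid when its bit is
 * set in 0x73, i.e. types 0, 1, 4, 5 and 6. */
#include <stdbool.h>
#include <stdio.h>

static bool valid_mtrr_type(unsigned int t)
{
        return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
}

int main(void)
{
        for (unsigned int t = 0; t < 8; t++)
                printf("type %u: %s\n", t, valid_mtrr_type(t) ? "valid" : "reserved");
        return 0;
}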
// SPDX-License-Identifier: GPL-2.0 /* * The class-specific portions of the driver model * * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org> * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2008-2009 Novell Inc. * Copyright (c) 2012-2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org> * Copyright (c) 2012-2019 Linux Foundation * * See Documentation/driver-api/driver-model/ for more information. */ #ifndef _DEVICE_CLASS_H_ #define _DEVICE_CLASS_H_ #include <linux/kobject.h> #include <linux/klist.h> #include <linux/pm.h> #include <linux/device/bus.h> struct device; struct fwnode_handle; /** * struct class - device classes * @name: Name of the class. * @class_groups: Default attributes of this class. * @dev_groups: Default attributes of the devices that belong to the class. * @dev_uevent: Called when a device is added, removed from this class, or a * few other things that generate uevents to add the environment * variables. * @devnode: Callback to provide the devtmpfs. * @class_release: Called to release this class. * @dev_release: Called to release the device. * @shutdown_pre: Called at shut-down time before driver shutdown. * @ns_type: Callbacks so sysfs can determine namespaces. * @namespace: Namespace of the device that belongs to this class. * @get_ownership: Allows class to specify uid/gid of the sysfs directories * for the devices belonging to the class. Usually tied to * device's namespace. * @pm: The default device power management operations of this class. * * A class is a higher-level view of a device that abstracts out low-level * implementation details. Drivers may see a SCSI disk or an ATA disk, but, * at the class level, they are all simply disks. Classes allow user space * to work with devices based on what they do, rather than how they are * connected or how they work.
*/ struct class { const char *name; const struct attribute_group **class_groups; const struct attribute_group **dev_groups; int (*dev_uevent)(const struct device *dev, struct kobj_uevent_env *env); char *(*devnode)(const struct device *dev, umode_t *mode); void (*class_release)(const struct class *class); void (*dev_release)(struct device *dev); int (*shutdown_pre)(struct device *dev); const struct kobj_ns_type_operations *ns_type; const void *(*namespace)(const struct device *dev); void (*get_ownership)(const struct device *dev, kuid_t *uid, kgid_t *gid); const struct dev_pm_ops *pm; }; struct class_dev_iter { struct klist_iter ki; const struct device_type *type; struct subsys_private *sp; }; int __must_check class_register(const struct class *class); void class_unregister(const struct class *class); bool class_is_registered(const struct class *class); struct class_compat; struct class_compat *class_compat_register(const char *name); void class_compat_unregister(struct class_compat *cls); int class_compat_create_link(struct class_compat *cls, struct device *dev); void class_compat_remove_link(struct class_compat *cls, struct device *dev); void class_dev_iter_init(struct class_dev_iter *iter, const struct class *class, const struct device *start, const struct device_type *type); struct device *class_dev_iter_next(struct class_dev_iter *iter); void class_dev_iter_exit(struct class_dev_iter *iter); int class_for_each_device(const struct class *class, const struct device *start, void *data, device_iter_t fn); struct device *class_find_device(const struct class *class, const struct device *start, const void *data, device_match_t match); /** * class_find_device_by_name - device iterator for locating a particular device * of a specific name. * @class: class type * @name: name of the device to match */ static inline struct device *class_find_device_by_name(const struct class *class, const char *name) { return class_find_device(class, NULL, name, device_match_name); } /** * class_find_device_by_of_node : device iterator for locating a particular device * matching the of_node. * @class: class type * @np: of_node of the device to match. */ static inline struct device *class_find_device_by_of_node(const struct class *class, const struct device_node *np) { return class_find_device(class, NULL, np, device_match_of_node); } /** * class_find_device_by_fwnode : device iterator for locating a particular device * matching the fwnode. * @class: class type * @fwnode: fwnode of the device to match. */ static inline struct device *class_find_device_by_fwnode(const struct class *class, const struct fwnode_handle *fwnode) { return class_find_device(class, NULL, fwnode, device_match_fwnode); } /** * class_find_device_by_devt : device iterator for locating a particular device * matching the device type. * @class: class type * @devt: device type of the device to match. */ static inline struct device *class_find_device_by_devt(const struct class *class, dev_t devt) { return class_find_device(class, NULL, &devt, device_match_devt); } #ifdef CONFIG_ACPI struct acpi_device; /** * class_find_device_by_acpi_dev : device iterator for locating a particular * device matching the ACPI_COMPANION device. * @class: class type * @adev: ACPI_COMPANION device to match. 
*/ static inline struct device *class_find_device_by_acpi_dev(const struct class *class, const struct acpi_device *adev) { return class_find_device(class, NULL, adev, device_match_acpi_dev); } #else static inline struct device *class_find_device_by_acpi_dev(const struct class *class, const void *adev) { return NULL; } #endif struct class_attribute { struct attribute attr; ssize_t (*show)(const struct class *class, const struct class_attribute *attr, char *buf); ssize_t (*store)(const struct class *class, const struct class_attribute *attr, const char *buf, size_t count); }; #define CLASS_ATTR_RW(_name) \ struct class_attribute class_attr_##_name = __ATTR_RW(_name) #define CLASS_ATTR_RO(_name) \ struct class_attribute class_attr_##_name = __ATTR_RO(_name) #define CLASS_ATTR_WO(_name) \ struct class_attribute class_attr_##_name = __ATTR_WO(_name) int __must_check class_create_file_ns(const struct class *class, const struct class_attribute *attr, const void *ns); void class_remove_file_ns(const struct class *class, const struct class_attribute *attr, const void *ns); static inline int __must_check class_create_file(const struct class *class, const struct class_attribute *attr) { return class_create_file_ns(class, attr, NULL); } static inline void class_remove_file(const struct class *class, const struct class_attribute *attr) { class_remove_file_ns(class, attr, NULL); } /* Simple class attribute that is just a static string */ struct class_attribute_string { struct class_attribute attr; char *str; }; /* Currently read-only only */ #define _CLASS_ATTR_STRING(_name, _mode, _str) \ { __ATTR(_name, _mode, show_class_attr_string, NULL), _str } #define CLASS_ATTR_STRING(_name, _mode, _str) \ struct class_attribute_string class_attr_##_name = \ _CLASS_ATTR_STRING(_name, _mode, _str) ssize_t show_class_attr_string(const struct class *class, const struct class_attribute *attr, char *buf); struct class_interface { struct list_head node; const struct class *class; int (*add_dev) (struct device *dev); void (*remove_dev) (struct device *dev); }; int __must_check class_interface_register(struct class_interface *); void class_interface_unregister(struct class_interface *); struct class * __must_check class_create(const char *name); void class_destroy(const struct class *cls); #endif /* _DEVICE_CLASS_H_ */
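As a usage illustration for the helpers declared above, the hypothetical module sketch below creates a class, looks a device up by name, and drops the reference the lookup obtained. The class and device names ("example", "example0") are made up, and dropping the reference with put_device() follows the usual class_find_device() convention; this is a sketch, not an actual in-tree driver.

/* Hypothetical sketch: how a driver might use class_create() and
 * class_find_device_by_name(). class_find_device() takes a reference on
 * the matching device, which is dropped here with put_device(). */
#include <linux/device.h>
#include <linux/device/class.h>
#include <linux/module.h>

static struct class *example_class;

static int __init example_init(void)
{
        struct device *dev;

        example_class = class_create("example");
        if (IS_ERR(example_class))
                return PTR_ERR(example_class);

        /* Devices would normally be added to the class first; a lookup on
         * an empty class simply returns NULL. */
        dev = class_find_device_by_name(example_class, "example0");
        if (dev) {
                dev_info(dev, "found by name\n");
                put_device(dev);
        }
        return 0;
}

static void __exit example_exit(void)
{
        class_destroy(example_class);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");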
// SPDX-License-Identifier: GPL-2.0 /* * Functions related to sysfs handling */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/blktrace_api.h> #include <linux/debugfs.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" #include "blk-wbt.h" #include "blk-cgroup.h" #include "blk-throttle.h" struct queue_sysfs_entry { struct attribute attr; ssize_t (*show)(struct gendisk *disk, char *page); ssize_t (*show_limit)(struct gendisk *disk, char *page); ssize_t (*store)(struct gendisk *disk, const char *page, size_t count); int (*store_limit)(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim); }; static ssize_t queue_var_show(unsigned long var, char *page) { return sysfs_emit(page, "%lu\n", var); } static ssize_t queue_var_store(unsigned long *var, const char *page, size_t count) { int err; unsigned long v; err = kstrtoul(page, 10, &v); if (err || v > UINT_MAX) return -EINVAL; *var = v; return count; } static ssize_t queue_requests_show(struct gendisk *disk, char *page) { ssize_t ret; mutex_lock(&disk->queue->elevator_lock); ret = queue_var_show(disk->queue->nr_requests, page); mutex_unlock(&disk->queue->elevator_lock); return ret; } static ssize_t queue_requests_store(struct gendisk *disk, const char *page, size_t count) { struct request_queue *q = disk->queue; struct blk_mq_tag_set *set = q->tag_set; struct elevator_tags *et = NULL; unsigned int memflags; unsigned long nr; int ret; ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; /* * Serialize updating nr_requests with concurrent queue_requests_store() * and switching elevator. */ down_write(&set->update_nr_hwq_lock); if (nr == q->nr_requests) goto unlock; if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; /* * Switching elevator is protected by update_nr_hwq_lock: * - read lock is held from elevator sysfs attribute; * - write lock is held from updating nr_hw_queues; * Hence it's safe to access q->elevator here with write lock held. */ if (nr <= set->reserved_tags || (q->elevator && nr > MAX_SCHED_RQ) || (!q->elevator && nr > set->queue_depth)) { ret = -EINVAL; goto unlock; } if (!blk_mq_is_shared_tags(set->flags) && q->elevator && nr > q->elevator->et->nr_requests) { /* * Tags will grow, allocate memory before freezing queue to * prevent deadlock.
*/ et = blk_mq_alloc_sched_tags(set, q->nr_hw_queues, nr); if (!et) { ret = -ENOMEM; goto unlock; } } memflags = blk_mq_freeze_queue(q); mutex_lock(&q->elevator_lock); et = blk_mq_update_nr_requests(q, et, nr); mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); if (et) blk_mq_free_sched_tags(et, set); unlock: up_write(&set->update_nr_hwq_lock); return ret; } static ssize_t queue_ra_show(struct gendisk *disk, char *page) { ssize_t ret; mutex_lock(&disk->queue->limits_lock); ret = queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page); mutex_unlock(&disk->queue->limits_lock); return ret; } static ssize_t queue_ra_store(struct gendisk *disk, const char *page, size_t count) { unsigned long ra_kb; ssize_t ret; unsigned int memflags; struct request_queue *q = disk->queue; ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; /* * ->ra_pages is protected by ->limits_lock because it is usually * calculated from the queue limits by queue_limits_commit_update. */ mutex_lock(&q->limits_lock); memflags = blk_mq_freeze_queue(q); disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); mutex_unlock(&q->limits_lock); blk_mq_unfreeze_queue(q, memflags); return ret; } #define QUEUE_SYSFS_LIMIT_SHOW(_field) \ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ { \ return queue_var_show(disk->queue->limits._field, page); \ } QUEUE_SYSFS_LIMIT_SHOW(max_segments) QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments) QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments) QUEUE_SYSFS_LIMIT_SHOW(max_segment_size) QUEUE_SYSFS_LIMIT_SHOW(max_write_streams) QUEUE_SYSFS_LIMIT_SHOW(write_stream_granularity) QUEUE_SYSFS_LIMIT_SHOW(logical_block_size) QUEUE_SYSFS_LIMIT_SHOW(physical_block_size) QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors) QUEUE_SYSFS_LIMIT_SHOW(io_min) QUEUE_SYSFS_LIMIT_SHOW(io_opt) QUEUE_SYSFS_LIMIT_SHOW(discard_granularity) QUEUE_SYSFS_LIMIT_SHOW(zone_write_granularity) QUEUE_SYSFS_LIMIT_SHOW(virt_boundary_mask) QUEUE_SYSFS_LIMIT_SHOW(dma_alignment) QUEUE_SYSFS_LIMIT_SHOW(max_open_zones) QUEUE_SYSFS_LIMIT_SHOW(max_active_zones) QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_min) QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_max) #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(_field) \ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ { \ return sysfs_emit(page, "%llu\n", \ (unsigned long long)disk->queue->limits._field << \ SECTOR_SHIFT); \ } QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_wzeroes_unmap_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_wzeroes_unmap_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors) #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field) \ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ { \ return queue_var_show(disk->queue->limits._field >> 1, page); \ } QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_hw_sectors) #define QUEUE_SYSFS_SHOW_CONST(_name, _val) \ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ return sysfs_emit(page, "%d\n", _val); \ } /* deprecated fields */ QUEUE_SYSFS_SHOW_CONST(discard_zeroes_data, 0) QUEUE_SYSFS_SHOW_CONST(write_same_max, 0) 
QUEUE_SYSFS_SHOW_CONST(poll_delay, -1) static int queue_max_discard_sectors_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) { unsigned long max_discard_bytes; ssize_t ret; ret = queue_var_store(&max_discard_bytes, page, count); if (ret < 0) return ret; if (max_discard_bytes & (disk->queue->limits.discard_granularity - 1)) return -EINVAL; if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX) return -EINVAL; lim->max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; return 0; } static int queue_max_wzeroes_unmap_sectors_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) { unsigned long max_zeroes_bytes, max_hw_zeroes_bytes; ssize_t ret; ret = queue_var_store(&max_zeroes_bytes, page, count); if (ret < 0) return ret; max_hw_zeroes_bytes = lim->max_hw_wzeroes_unmap_sectors << SECTOR_SHIFT; if (max_zeroes_bytes != 0 && max_zeroes_bytes != max_hw_zeroes_bytes) return -EINVAL; lim->max_user_wzeroes_unmap_sectors = max_zeroes_bytes >> SECTOR_SHIFT; return 0; } static int queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) { unsigned long max_sectors_kb; ssize_t ret; ret = queue_var_store(&max_sectors_kb, page, count); if (ret < 0) return ret; lim->max_user_sectors = max_sectors_kb << 1; return 0; } static ssize_t queue_feature_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim, blk_features_t feature) { unsigned long val; ssize_t ret; ret = queue_var_store(&val, page, count); if (ret < 0) return ret; if (val) lim->features |= feature; else lim->features &= ~feature; return 0; } #define QUEUE_SYSFS_FEATURE(_name, _feature) \ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ return sysfs_emit(page, "%u\n", \ !!(disk->queue->limits.features & _feature)); \ } \ static int queue_##_name##_store(struct gendisk *disk, \ const char *page, size_t count, struct queue_limits *lim) \ { \ return queue_feature_store(disk, page, count, lim, _feature); \ } QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL) QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM) QUEUE_SYSFS_FEATURE(iostats, BLK_FEAT_IO_STAT) QUEUE_SYSFS_FEATURE(stable_writes, BLK_FEAT_STABLE_WRITES); #define QUEUE_SYSFS_FEATURE_SHOW(_name, _feature) \ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ return sysfs_emit(page, "%u\n", \ !!(disk->queue->limits.features & _feature)); \ } QUEUE_SYSFS_FEATURE_SHOW(fua, BLK_FEAT_FUA); QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX); static ssize_t queue_poll_show(struct gendisk *disk, char *page) { if (queue_is_mq(disk->queue)) return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue)); return sysfs_emit(page, "%u\n", !!(disk->queue->limits.features & BLK_FEAT_POLL)); } static ssize_t queue_zoned_show(struct gendisk *disk, char *page) { if (blk_queue_is_zoned(disk->queue)) return sysfs_emit(page, "host-managed\n"); return sysfs_emit(page, "none\n"); } static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page) { return queue_var_show(disk_nr_zones(disk), page); } static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page) { return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page); } static int queue_iostats_passthrough_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) { unsigned long ios; ssize_t ret; ret = queue_var_store(&ios, page, count); if (ret < 0) return ret; if (ios) lim->flags |= 
BLK_FLAG_IOSTATS_PASSTHROUGH; else lim->flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH; return 0; } static ssize_t queue_nomerges_show(struct gendisk *disk, char *page) { return queue_var_show((blk_queue_nomerges(disk->queue) << 1) | blk_queue_noxmerges(disk->queue), page); } static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nm; unsigned int memflags; struct request_queue *q = disk->queue; ssize_t ret = queue_var_store(&nm, page, count); if (ret < 0) return ret; memflags = blk_mq_freeze_queue(q); blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); if (nm == 2) blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); else if (nm) blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); blk_mq_unfreeze_queue(q, memflags); return ret; } static ssize_t queue_rq_affinity_show(struct gendisk *disk, char *page) { bool set = test_bit(QUEUE_FLAG_SAME_COMP, &disk->queue->queue_flags); bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &disk->queue->queue_flags); return queue_var_show(set << force, page); } static ssize_t queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) { ssize_t ret = -EINVAL; #ifdef CONFIG_SMP struct request_queue *q = disk->queue; unsigned long val; unsigned int memflags; ret = queue_var_store(&val, page, count); if (ret < 0) return ret; /* * Here we update two queue flags each using atomic bitops, although * updating two flags isn't atomic it should be harmless as those flags * are accessed individually using atomic test_bit operation. So we * don't grab any lock while updating these flags. */ memflags = blk_mq_freeze_queue(q); if (val == 2) { blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); } else if (val == 1) { blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } else if (val == 0) { blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } blk_mq_unfreeze_queue(q, memflags); #endif return ret; } static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page, size_t count) { return count; } static ssize_t queue_poll_store(struct gendisk *disk, const char *page, size_t count) { unsigned int memflags; ssize_t ret = count; struct request_queue *q = disk->queue; memflags = blk_mq_freeze_queue(q); if (!(q->limits.features & BLK_FEAT_POLL)) { ret = -EINVAL; goto out; } pr_info_ratelimited("writes to the poll attribute are ignored.\n"); pr_info_ratelimited("please use driver specific parameters instead.\n"); out: blk_mq_unfreeze_queue(q, memflags); return ret; } static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page) { return sysfs_emit(page, "%u\n", jiffies_to_msecs(READ_ONCE(disk->queue->rq_timeout))); } static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, size_t count) { unsigned int val, memflags; int err; struct request_queue *q = disk->queue; err = kstrtou32(page, 10, &val); if (err || val == 0) return -EINVAL; memflags = blk_mq_freeze_queue(q); blk_queue_rq_timeout(q, msecs_to_jiffies(val)); blk_mq_unfreeze_queue(q, memflags); return count; } static ssize_t queue_wc_show(struct gendisk *disk, char *page) { if (blk_queue_write_cache(disk->queue)) return sysfs_emit(page, "write back\n"); return sysfs_emit(page, "write through\n"); } static int queue_wc_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) { bool disable; if (!strncmp(page, "write back", 10)) { disable = false; } else if 
(!strncmp(page, "write through", 13) || !strncmp(page, "none", 4)) { disable = true; } else { return -EINVAL; } if (disable) lim->flags |= BLK_FLAG_WRITE_CACHE_DISABLED; else lim->flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED; return 0; } #define QUEUE_RO_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0444 }, \ .show = _prefix##_show, \ }; #define QUEUE_RW_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0644 }, \ .show = _prefix##_show, \ .store = _prefix##_store, \ }; #define QUEUE_LIM_RO_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0444 }, \ .show_limit = _prefix##_show, \ } #define QUEUE_LIM_RW_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0644 }, \ .show_limit = _prefix##_show, \ .store_limit = _prefix##_store, \ } QUEUE_RW_ENTRY(queue_requests, "nr_requests"); QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb"); QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments"); QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size"); QUEUE_LIM_RO_ENTRY(queue_max_write_streams, "max_write_streams"); QUEUE_LIM_RO_ENTRY(queue_write_stream_granularity, "write_stream_granularity"); QUEUE_RW_ENTRY(elv_iosched, "scheduler"); QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size"); QUEUE_LIM_RO_ENTRY(queue_physical_block_size, "physical_block_size"); QUEUE_LIM_RO_ENTRY(queue_chunk_sectors, "chunk_sectors"); QUEUE_LIM_RO_ENTRY(queue_io_min, "minimum_io_size"); QUEUE_LIM_RO_ENTRY(queue_io_opt, "optimal_io_size"); QUEUE_LIM_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); QUEUE_LIM_RO_ENTRY(queue_discard_granularity, "discard_granularity"); QUEUE_LIM_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes"); QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes"); QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); QUEUE_LIM_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_atomic_write_boundary_sectors, "atomic_write_boundary_bytes"); QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_max_hw_wzeroes_unmap_sectors, "write_zeroes_unmap_max_hw_bytes"); QUEUE_LIM_RW_ENTRY(queue_max_wzeroes_unmap_sectors, "write_zeroes_unmap_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned"); QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones"); QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones"); QUEUE_RW_ENTRY(queue_nomerges, "nomerges"); QUEUE_LIM_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough"); QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity"); QUEUE_RW_ENTRY(queue_poll, "io_poll"); QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay"); QUEUE_LIM_RW_ENTRY(queue_wc, "write_cache"); 
QUEUE_LIM_RO_ENTRY(queue_fua, "fua"); QUEUE_LIM_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); QUEUE_LIM_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); QUEUE_LIM_RO_ENTRY(queue_dma_alignment, "dma_alignment"); /* legacy alias for logical_block_size: */ static struct queue_sysfs_entry queue_hw_sector_size_entry = { .attr = {.name = "hw_sector_size", .mode = 0444 }, .show_limit = queue_logical_block_size_show, }; QUEUE_LIM_RW_ENTRY(queue_rotational, "rotational"); QUEUE_LIM_RW_ENTRY(queue_iostats, "iostats"); QUEUE_LIM_RW_ENTRY(queue_add_random, "add_random"); QUEUE_LIM_RW_ENTRY(queue_stable_writes, "stable_writes"); #ifdef CONFIG_BLK_WBT static ssize_t queue_var_store64(s64 *var, const char *page) { int err; s64 v; err = kstrtos64(page, 10, &v); if (err < 0) return err; *var = v; return 0; } static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) { ssize_t ret; struct request_queue *q = disk->queue; mutex_lock(&disk->rqos_state_mutex); if (!wbt_rq_qos(q)) { ret = -EINVAL; goto out; } if (wbt_disabled(q)) { ret = sysfs_emit(page, "0\n"); goto out; } ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); out: mutex_unlock(&disk->rqos_state_mutex); return ret; } static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, size_t count) { struct request_queue *q = disk->queue; struct rq_qos *rqos; ssize_t ret; s64 val; unsigned int memflags; ret = queue_var_store64(&val, page); if (ret < 0) return ret; if (val < -1) return -EINVAL; /* * Ensure that the queue is idled, in case the latency update * ends up either enabling or disabling wbt completely. We can't * have IO inflight if that happens. */ memflags = blk_mq_freeze_queue(q); rqos = wbt_rq_qos(q); if (!rqos) { ret = wbt_init(disk); if (ret) goto out; } ret = count; if (val == -1) val = wbt_default_latency_nsec(q); else if (val >= 0) val *= 1000ULL; if (wbt_get_min_lat(q) == val) goto out; blk_mq_quiesce_queue(q); mutex_lock(&disk->rqos_state_mutex); wbt_set_min_lat(q, val); mutex_unlock(&disk->rqos_state_mutex); blk_mq_unquiesce_queue(q); out: blk_mq_unfreeze_queue(q, memflags); return ret; } QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); #endif /* Common attributes for bio-based and request-based queues. */ static struct attribute *queue_attrs[] = { /* * Attributes which are protected with q->limits_lock. 
*/ &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, &queue_max_segments_entry.attr, &queue_max_discard_segments_entry.attr, &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, &queue_max_write_streams_entry.attr, &queue_write_stream_granularity_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, &queue_physical_block_size_entry.attr, &queue_chunk_sectors_entry.attr, &queue_io_min_entry.attr, &queue_io_opt_entry.attr, &queue_discard_granularity_entry.attr, &queue_max_discard_sectors_entry.attr, &queue_max_hw_discard_sectors_entry.attr, &queue_atomic_write_max_sectors_entry.attr, &queue_atomic_write_boundary_sectors_entry.attr, &queue_atomic_write_unit_min_entry.attr, &queue_atomic_write_unit_max_entry.attr, &queue_max_write_zeroes_sectors_entry.attr, &queue_max_hw_wzeroes_unmap_sectors_entry.attr, &queue_max_wzeroes_unmap_sectors_entry.attr, &queue_max_zone_append_sectors_entry.attr, &queue_zone_write_granularity_entry.attr, &queue_rotational_entry.attr, &queue_zoned_entry.attr, &queue_max_open_zones_entry.attr, &queue_max_active_zones_entry.attr, &queue_iostats_passthrough_entry.attr, &queue_iostats_entry.attr, &queue_stable_writes_entry.attr, &queue_add_random_entry.attr, &queue_wc_entry.attr, &queue_fua_entry.attr, &queue_dax_entry.attr, &queue_virt_boundary_mask_entry.attr, &queue_dma_alignment_entry.attr, &queue_ra_entry.attr, /* * Attributes which don't require locking. */ &queue_discard_zeroes_data_entry.attr, &queue_write_same_max_entry.attr, &queue_nr_zones_entry.attr, &queue_nomerges_entry.attr, &queue_poll_entry.attr, &queue_poll_delay_entry.attr, NULL, }; /* Request-based queue attributes that are not relevant for bio-based queues. */ static struct attribute *blk_mq_queue_attrs[] = { /* * Attributes which require some form of locking other than * q->sysfs_lock. */ &elv_iosched_entry.attr, &queue_requests_entry.attr, #ifdef CONFIG_BLK_WBT &queue_wb_lat_entry.attr, #endif /* * Attributes which don't require locking. 
*/ &queue_rq_affinity_entry.attr, &queue_io_timeout_entry.attr, NULL, }; static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, int n) { struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; if ((attr == &queue_max_open_zones_entry.attr || attr == &queue_max_active_zones_entry.attr) && !blk_queue_is_zoned(q)) return 0; return attr->mode; } static umode_t blk_mq_queue_attr_visible(struct kobject *kobj, struct attribute *attr, int n) { struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; if (!queue_is_mq(q)) return 0; if (attr == &queue_io_timeout_entry.attr && !q->mq_ops->timeout) return 0; return attr->mode; } static struct attribute_group queue_attr_group = { .attrs = queue_attrs, .is_visible = queue_attr_visible, }; static struct attribute_group blk_mq_queue_attr_group = { .attrs = blk_mq_queue_attrs, .is_visible = blk_mq_queue_attr_visible, }; #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) static ssize_t queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); if (!entry->show && !entry->show_limit) return -EIO; if (entry->show_limit) { ssize_t res; mutex_lock(&disk->queue->limits_lock); res = entry->show_limit(disk, page); mutex_unlock(&disk->queue->limits_lock); return res; } return entry->show(disk, page); } static ssize_t queue_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; if (!entry->store_limit && !entry->store) return -EIO; if (entry->store_limit) { ssize_t res; struct queue_limits lim = queue_limits_start_update(q); res = entry->store_limit(disk, page, length, &lim); if (res < 0) { queue_limits_cancel_update(q); return res; } res = queue_limits_commit_update_frozen(q, &lim); if (res) return res; return length; } return entry->store(disk, page, length); } static const struct sysfs_ops queue_sysfs_ops = { .show = queue_attr_show, .store = queue_attr_store, }; static const struct attribute_group *blk_queue_attr_groups[] = { &queue_attr_group, &blk_mq_queue_attr_group, NULL }; static void blk_queue_release(struct kobject *kobj) { /* nothing to do here, all data is associated with the parent gendisk */ } const struct kobj_type blk_queue_ktype = { .default_groups = blk_queue_attr_groups, .sysfs_ops = &queue_sysfs_ops, .release = blk_queue_release, }; static void blk_debugfs_remove(struct gendisk *disk) { struct request_queue *q = disk->queue; mutex_lock(&q->debugfs_mutex); blk_trace_shutdown(q); debugfs_remove_recursive(q->debugfs_dir); q->debugfs_dir = NULL; q->sched_debugfs_dir = NULL; q->rqos_debugfs_dir = NULL; mutex_unlock(&q->debugfs_mutex); } /** * blk_register_queue - register a block layer queue with sysfs * @disk: Disk of which the request queue should be registered with sysfs. 
*/ int blk_register_queue(struct gendisk *disk) { struct request_queue *q = disk->queue; int ret; ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue"); if (ret < 0) return ret; if (queue_is_mq(q)) { ret = blk_mq_sysfs_register(disk); if (ret) goto out_del_queue_kobj; } mutex_lock(&q->sysfs_lock); mutex_lock(&q->debugfs_mutex); q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root); if (queue_is_mq(q)) blk_mq_debugfs_register(q); mutex_unlock(&q->debugfs_mutex); ret = disk_register_independent_access_ranges(disk); if (ret) goto out_debugfs_remove; ret = blk_crypto_sysfs_register(disk); if (ret) goto out_unregister_ia_ranges; if (queue_is_mq(q)) elevator_set_default(q); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&disk->queue_kobj, KOBJ_ADD); if (q->elevator) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock); /* * SCSI probing may synchronously create and destroy a lot of * request_queues for non-existent devices. Shutting down a fully * functional queue takes measureable wallclock time as RCU grace * periods are involved. To avoid excessive latency in these * cases, a request_queue starts out in a degraded mode which is * faster to shut down and is made fully functional here as * request_queues for non-existent devices never get registered. */ blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); percpu_ref_switch_to_percpu(&q->q_usage_counter); return ret; out_unregister_ia_ranges: disk_unregister_independent_access_ranges(disk); out_debugfs_remove: blk_debugfs_remove(disk); mutex_unlock(&q->sysfs_lock); if (queue_is_mq(q)) blk_mq_sysfs_unregister(disk); out_del_queue_kobj: kobject_del(&disk->queue_kobj); return ret; } /** * blk_unregister_queue - counterpart of blk_register_queue() * @disk: Disk of which the request queue should be unregistered from sysfs. * * Note: the caller is responsible for guaranteeing that this function is called * after blk_register_queue() has finished. */ void blk_unregister_queue(struct gendisk *disk) { struct request_queue *q = disk->queue; if (WARN_ON(!q)) return; /* Return early if disk->queue was never registered. */ if (!blk_queue_registered(q)) return; /* * Since sysfs_remove_dir() prevents adding new directory entries * before removal of existing entries starts, protect against * concurrent elv_iosched_store() calls. */ mutex_lock(&q->sysfs_lock); blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q); mutex_unlock(&q->sysfs_lock); /* * Remove the sysfs attributes before unregistering the queue data * structures that can be modified through sysfs. */ if (queue_is_mq(q)) blk_mq_sysfs_unregister(disk); blk_crypto_sysfs_unregister(disk); mutex_lock(&q->sysfs_lock); disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); /* Now that we've deleted all child objects, we can delete the queue. */ kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE); kobject_del(&disk->queue_kobj); if (queue_is_mq(q)) elevator_set_none(q); blk_debugfs_remove(disk); }
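The attributes defined above end up under /sys/block/<disk>/queue/ once blk_register_queue() has added the "queue" kobject beneath the disk's device. The small user-space sketch below reads two of them, nr_requests and read_ahead_kb, for an assumed disk; the disk name "sda" is a placeholder and error handling is minimal.

/* Sketch: read two of the queue attributes registered above from sysfs.
 * "sda" is a placeholder disk name. */
#include <stdio.h>

static void show_attr(const char *disk, const char *attr)
{
        char path[128], buf[64];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/block/%s/queue/%s", disk, attr);
        f = fopen(path, "r");
        if (!f) {
                perror(path);
                return;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("%s: %s", attr, buf);
        fclose(f);
}

int main(void)
{
        show_attr("sda", "nr_requests");
        show_attr("sda", "read_ahead_kb");
        return 0;
}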
// SPDX-License-Identifier: GPL-2.0
/*
 * Wireless utility functions
 *
 * Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014 Intel Mobile Communications GmbH
 * Copyright 2017 Intel Deutschland GmbH
 * Copyright (C) 2018-2023, 2025 Intel Corporation
 */
#include <linux/export.h>
#include <linux/bitops.h>
#include <linux/etherdevice.h>
#include <linux/slab.h>
#include <linux/ieee80211.h>
#include <net/cfg80211.h>
#include <net/ip.h>
#include <net/dsfield.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/gcd.h>
#include <linux/bitfield.h>
#include <linux/nospec.h>
#include "core.h"
#include "rdev-ops.h"

const struct ieee80211_rate *
ieee80211_get_response_rate(struct ieee80211_supported_band *sband,
			    u32 basic_rates, int bitrate)
{
	struct ieee80211_rate *result = &sband->bitrates[0];
	int i;

	for (i = 0; i < sband->n_bitrates; i++) {
		if (!(basic_rates & BIT(i)))
			continue;
		if (sband->bitrates[i].bitrate > bitrate)
			continue;
		result = &sband->bitrates[i];
	}

	return result;
}
EXPORT_SYMBOL(ieee80211_get_response_rate);

u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband)
{
	struct ieee80211_rate *bitrates;
	u32 mandatory_rates = 0;
	enum ieee80211_rate_flags mandatory_flag;
	int i;

	if (WARN_ON(!sband))
		return 1;

	if (sband->band == NL80211_BAND_2GHZ)
		mandatory_flag = IEEE80211_RATE_MANDATORY_B;
	else
		mandatory_flag = IEEE80211_RATE_MANDATORY_A;

	bitrates = sband->bitrates;
	for (i = 0; i < sband->n_bitrates; i++)
		if (bitrates[i].flags & mandatory_flag)
			mandatory_rates |= BIT(i);
	return mandatory_rates;
}
EXPORT_SYMBOL(ieee80211_mandatory_rates);

u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band)
{
	/* see 802.11 17.3.8.3.2 and Annex J
	 * there are overlapping channel numbers in 5GHz and 2GHz bands
	 */
	if (chan <= 0)
		return 0; /* not supported */
	switch (band) {
	case NL80211_BAND_2GHZ:
	case NL80211_BAND_LC:
		if (chan == 14)
			return MHZ_TO_KHZ(2484);
		else if (chan < 14)
			return MHZ_TO_KHZ(2407 + chan * 5);
		break;
	case NL80211_BAND_5GHZ:
		if (chan >= 182 && chan <= 196)
			return MHZ_TO_KHZ(4000 + chan * 5);
		else
			return MHZ_TO_KHZ(5000 + chan * 5);
		break;
	case NL80211_BAND_6GHZ:
		/* see 802.11ax D6.1 27.3.23.2 */
		if (chan == 2)
			return MHZ_TO_KHZ(5935);
		if (chan <= 233)
			return MHZ_TO_KHZ(5950 + chan * 5);
		break;
	case NL80211_BAND_60GHZ:
		if (chan < 7)
			return MHZ_TO_KHZ(56160 + chan * 2160);
		break;
	case NL80211_BAND_S1GHZ:
		return 902000 + chan * 500;
	default:
		;
	}
	return 0; /* not supported */
}
EXPORT_SYMBOL(ieee80211_channel_to_freq_khz);

int ieee80211_freq_khz_to_channel(u32 freq)
{
	/* TODO: just handle MHz for now */
	freq = KHZ_TO_MHZ(freq);

	/* see 802.11 17.3.8.3.2 and Annex J */
	if (freq == 2484)
		return 14;
	else if (freq < 2484)
		return (freq - 2407) / 5;
	else if (freq >= 4910 && freq <= 4980)
		return (freq - 4000) / 5;
	else if (freq < 5925)
		return (freq - 5000) / 5;
	else if (freq == 5935)
		return 2;
	else if (freq <= 45000) /* DMG band lower limit */
		/* see 802.11ax D6.1 27.3.22.2 */
		return (freq - 5950) / 5;
	else if (freq >= 58320 && freq <= 70200)
		return (freq - 56160) / 2160;
	else
		return 0;
}
EXPORT_SYMBOL(ieee80211_freq_khz_to_channel);

struct ieee80211_channel *ieee80211_get_channel_khz(struct wiphy *wiphy,
						    u32 freq)
{
	enum nl80211_band band;
	struct ieee80211_supported_band *sband;
	int i;

	for (band = 0; band < NUM_NL80211_BANDS; band++) {
		sband = wiphy->bands[band];

		if (!sband)
			continue;

		for (i = 0; i < sband->n_channels; i++) {
			struct ieee80211_channel *chan = &sband->channels[i];

			if (ieee80211_channel_to_khz(chan) == freq)
				return chan;
		}
	}

	return NULL;
}
EXPORT_SYMBOL(ieee80211_get_channel_khz);

static void
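/*
 * Worked example for the channel/frequency helpers above (illustrative
 * aside, not part of the original file; the values follow directly from
 * the formulas in ieee80211_channel_to_freq_khz() and
 * ieee80211_freq_khz_to_channel()):
 *
 *   ieee80211_channel_to_freq_khz(6, NL80211_BAND_2GHZ)  == MHZ_TO_KHZ(2437)
 *   ieee80211_channel_to_freq_khz(36, NL80211_BAND_5GHZ) == MHZ_TO_KHZ(5180)
 *   ieee80211_freq_khz_to_channel(MHZ_TO_KHZ(5180))      == 36
 */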
set_mandatory_flags_band(struct ieee80211_supported_band *sband) { int i, want; switch (sband->band) { case NL80211_BAND_5GHZ: case NL80211_BAND_6GHZ: want = 3; for (i = 0; i < sband->n_bitrates; i++) { if (sband->bitrates[i].bitrate == 60 || sband->bitrates[i].bitrate == 120 || sband->bitrates[i].bitrate == 240) { sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_A; want--; } } WARN_ON(want); break; case NL80211_BAND_2GHZ: case NL80211_BAND_LC: want = 7; for (i = 0; i < sband->n_bitrates; i++) { switch (sband->bitrates[i].bitrate) { case 10: case 20: case 55: case 110: sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_B | IEEE80211_RATE_MANDATORY_G; want--; break; case 60: case 120: case 240: sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_G; want--; fallthrough; default: sband->bitrates[i].flags |= IEEE80211_RATE_ERP_G; break; } } WARN_ON(want != 0 && want != 3); break; case NL80211_BAND_60GHZ: /* check for mandatory HT MCS 1..4 */ WARN_ON(!sband->ht_cap.ht_supported); WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e); break; case NL80211_BAND_S1GHZ: /* Figure 9-589bd: 3 means unsupported, so != 3 means at least * mandatory is ok. */ WARN_ON((sband->s1g_cap.nss_mcs[0] & 0x3) == 0x3); break; case NUM_NL80211_BANDS: default: WARN_ON(1); break; } } void ieee80211_set_bitrate_flags(struct wiphy *wiphy) { enum nl80211_band band; for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) set_mandatory_flags_band(wiphy->bands[band]); } bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher) { int i; for (i = 0; i < wiphy->n_cipher_suites; i++) if (cipher == wiphy->cipher_suites[i]) return true; return false; } static bool cfg80211_igtk_cipher_supported(struct cfg80211_registered_device *rdev) { struct wiphy *wiphy = &rdev->wiphy; int i; for (i = 0; i < wiphy->n_cipher_suites; i++) { switch (wiphy->cipher_suites[i]) { case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: return true; } } return false; } bool cfg80211_valid_key_idx(struct cfg80211_registered_device *rdev, int key_idx, bool pairwise) { int max_key_idx; if (pairwise) max_key_idx = 3; else if (wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION) || wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; else if (cfg80211_igtk_cipher_supported(rdev)) max_key_idx = 5; else max_key_idx = 3; if (key_idx < 0 || key_idx > max_key_idx) return false; return true; } int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, bool pairwise, const u8 *mac_addr) { if (!cfg80211_valid_key_idx(rdev, key_idx, pairwise)) return -EINVAL; if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) return -EINVAL; if (pairwise && !mac_addr) return -EINVAL; switch (params->cipher) { case WLAN_CIPHER_SUITE_TKIP: /* Extended Key ID can only be used with CCMP/GCMP ciphers */ if ((pairwise && key_idx) || params->mode != NL80211_KEY_RX_TX) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: /* IEEE802.11-2016 allows only 0 and - when supporting * Extended Key ID - 1 as index for pairwise keys. * @NL80211_KEY_NO_TX is only allowed for pairwise keys when * the driver supports Extended Key ID. * @NL80211_KEY_SET_TX can't be set when installing and * validating a key. 
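 *
 * (Illustrative note, not in the original comment: when the driver sets
 *  NL80211_EXT_FEATURE_EXT_KEY_ID, a CCMP/GCMP pairwise key may use
 *  key_idx 0 or 1; without that feature, pairwise keys must use key_idx 0.
 *  This mirrors the checks performed immediately below.)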
*/ if ((params->mode == NL80211_KEY_NO_TX && !pairwise) || params->mode == NL80211_KEY_SET_TX) return -EINVAL; if (wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_EXT_KEY_ID)) { if (pairwise && (key_idx < 0 || key_idx > 1)) return -EINVAL; } else if (pairwise && key_idx) { return -EINVAL; } break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: /* Disallow BIP (group-only) cipher as pairwise cipher */ if (pairwise) return -EINVAL; if (key_idx < 4) return -EINVAL; break; case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: if (key_idx > 3) return -EINVAL; break; default: break; } switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: if (params->key_len != WLAN_KEY_LEN_WEP40) return -EINVAL; break; case WLAN_CIPHER_SUITE_TKIP: if (params->key_len != WLAN_KEY_LEN_TKIP) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP: if (params->key_len != WLAN_KEY_LEN_CCMP) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP_256: if (params->key_len != WLAN_KEY_LEN_CCMP_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_GCMP: if (params->key_len != WLAN_KEY_LEN_GCMP) return -EINVAL; break; case WLAN_CIPHER_SUITE_GCMP_256: if (params->key_len != WLAN_KEY_LEN_GCMP_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_WEP104: if (params->key_len != WLAN_KEY_LEN_WEP104) return -EINVAL; break; case WLAN_CIPHER_SUITE_AES_CMAC: if (params->key_len != WLAN_KEY_LEN_AES_CMAC) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_CMAC_256: if (params->key_len != WLAN_KEY_LEN_BIP_CMAC_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_128) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_256) return -EINVAL; break; default: /* * We don't know anything about this algorithm, * allow using it -- but the driver must check * all parameters! We still check below whether * or not the driver supports this algorithm, * of course. */ break; } if (params->seq) { switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: /* These ciphers do not use key sequence */ return -EINVAL; case WLAN_CIPHER_SUITE_TKIP: case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (params->seq_len != 6) return -EINVAL; break; } } if (!cfg80211_supported_cipher_suite(&rdev->wiphy, params->cipher)) return -EINVAL; return 0; } unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc) { unsigned int hdrlen = 24; if (ieee80211_is_ext(fc)) { hdrlen = 4; goto out; } if (ieee80211_is_data(fc)) { if (ieee80211_has_a4(fc)) hdrlen = 30; if (ieee80211_is_data_qos(fc)) { hdrlen += IEEE80211_QOS_CTL_LEN; if (ieee80211_has_order(fc)) hdrlen += IEEE80211_HT_CTL_LEN; } goto out; } if (ieee80211_is_mgmt(fc)) { if (ieee80211_has_order(fc)) hdrlen += IEEE80211_HT_CTL_LEN; goto out; } if (ieee80211_is_ctl(fc)) { /* * ACK and CTS are 10 bytes, all others 16. 
To see how * to get this condition consider * subtype mask: 0b0000000011110000 (0x00F0) * ACK subtype: 0b0000000011010000 (0x00D0) * CTS subtype: 0b0000000011000000 (0x00C0) * bits that matter: ^^^ (0x00E0) * value of those: 0b0000000011000000 (0x00C0) */ if ((fc & cpu_to_le16(0x00E0)) == cpu_to_le16(0x00C0)) hdrlen = 10; else hdrlen = 16; } out: return hdrlen; } EXPORT_SYMBOL(ieee80211_hdrlen); unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb) { const struct ieee80211_hdr *hdr = (const struct ieee80211_hdr *)skb->data; unsigned int hdrlen; if (unlikely(skb->len < 10)) return 0; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (unlikely(hdrlen > skb->len)) return 0; return hdrlen; } EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb); static unsigned int __ieee80211_get_mesh_hdrlen(u8 flags) { int ae = flags & MESH_FLAGS_AE; /* 802.11-2012, 8.2.4.7.3 */ switch (ae) { default: case 0: return 6; case MESH_FLAGS_AE_A4: return 12; case MESH_FLAGS_AE_A5_A6: return 18; } } unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) { return __ieee80211_get_mesh_hdrlen(meshhdr->flags); } EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen); bool ieee80211_get_8023_tunnel_proto(const void *hdr, __be16 *proto) { const __be16 *hdr_proto = hdr + ETH_ALEN; if (!(ether_addr_equal(hdr, rfc1042_header) && *hdr_proto != htons(ETH_P_AARP) && *hdr_proto != htons(ETH_P_IPX)) && !ether_addr_equal(hdr, bridge_tunnel_header)) return false; *proto = *hdr_proto; return true; } EXPORT_SYMBOL(ieee80211_get_8023_tunnel_proto); int ieee80211_strip_8023_mesh_hdr(struct sk_buff *skb) { const void *mesh_addr; struct { struct ethhdr eth; u8 flags; } payload; int hdrlen; int ret; ret = skb_copy_bits(skb, 0, &payload, sizeof(payload)); if (ret) return ret; hdrlen = sizeof(payload.eth) + __ieee80211_get_mesh_hdrlen(payload.flags); if (likely(pskb_may_pull(skb, hdrlen + 8) && ieee80211_get_8023_tunnel_proto(skb->data + hdrlen, &payload.eth.h_proto))) hdrlen += ETH_ALEN + 2; else if (!pskb_may_pull(skb, hdrlen)) return -EINVAL; else payload.eth.h_proto = htons(skb->len - hdrlen); mesh_addr = skb->data + sizeof(payload.eth) + ETH_ALEN; switch (payload.flags & MESH_FLAGS_AE) { case MESH_FLAGS_AE_A4: memcpy(&payload.eth.h_source, mesh_addr, ETH_ALEN); break; case MESH_FLAGS_AE_A5_A6: memcpy(&payload.eth, mesh_addr, 2 * ETH_ALEN); break; default: break; } pskb_pull(skb, hdrlen - sizeof(payload.eth)); memcpy(skb->data, &payload.eth, sizeof(payload.eth)); return 0; } EXPORT_SYMBOL(ieee80211_strip_8023_mesh_hdr); int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, const u8 *addr, enum nl80211_iftype iftype, u8 data_offset, bool is_amsdu) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct { u8 hdr[ETH_ALEN] __aligned(2); __be16 proto; } payload; struct ethhdr tmp; u16 hdrlen; if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) return -1; hdrlen = ieee80211_hdrlen(hdr->frame_control) + data_offset; if (skb->len < hdrlen) return -1; /* convert IEEE 802.11 header + possible LLC headers into Ethernet * header * IEEE 802.11 address fields: * ToDS FromDS Addr1 Addr2 Addr3 Addr4 * 0 0 DA SA BSSID n/a * 0 1 DA BSSID SA n/a * 1 0 BSSID SA DA n/a * 1 1 RA TA DA SA */ memcpy(tmp.h_dest, ieee80211_get_DA(hdr), ETH_ALEN); memcpy(tmp.h_source, ieee80211_get_SA(hdr), ETH_ALEN); switch (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { case cpu_to_le16(IEEE80211_FCTL_TODS): if (unlikely(iftype != NL80211_IFTYPE_AP && iftype != NL80211_IFTYPE_AP_VLAN 
&& iftype != NL80211_IFTYPE_P2P_GO)) return -1; break; case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS): if (unlikely(iftype != NL80211_IFTYPE_MESH_POINT && iftype != NL80211_IFTYPE_AP_VLAN && iftype != NL80211_IFTYPE_STATION)) return -1; break; case cpu_to_le16(IEEE80211_FCTL_FROMDS): if ((iftype != NL80211_IFTYPE_STATION && iftype != NL80211_IFTYPE_P2P_CLIENT && iftype != NL80211_IFTYPE_MESH_POINT) || (is_multicast_ether_addr(tmp.h_dest) && ether_addr_equal(tmp.h_source, addr))) return -1; break; case cpu_to_le16(0): if (iftype != NL80211_IFTYPE_ADHOC && iftype != NL80211_IFTYPE_STATION && iftype != NL80211_IFTYPE_OCB) return -1; break; } if (likely(!is_amsdu && iftype != NL80211_IFTYPE_MESH_POINT && skb_copy_bits(skb, hdrlen, &payload, sizeof(payload)) == 0 && ieee80211_get_8023_tunnel_proto(&payload, &tmp.h_proto))) { /* remove RFC1042 or Bridge-Tunnel encapsulation */ hdrlen += ETH_ALEN + 2; skb_postpull_rcsum(skb, &payload, ETH_ALEN + 2); } else { tmp.h_proto = htons(skb->len - hdrlen); } pskb_pull(skb, hdrlen); if (!ehdr) ehdr = skb_push(skb, sizeof(struct ethhdr)); memcpy(ehdr, &tmp, sizeof(tmp)); return 0; } EXPORT_SYMBOL(ieee80211_data_to_8023_exthdr); static void __frame_add_frag(struct sk_buff *skb, struct page *page, void *ptr, int len, int size) { struct skb_shared_info *sh = skb_shinfo(skb); int page_offset; get_page(page); page_offset = ptr - page_address(page); skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size); } static void __ieee80211_amsdu_copy_frag(struct sk_buff *skb, struct sk_buff *frame, int offset, int len) { struct skb_shared_info *sh = skb_shinfo(skb); const skb_frag_t *frag = &sh->frags[0]; struct page *frag_page; void *frag_ptr; int frag_len, frag_size; int head_size = skb->len - skb->data_len; int cur_len; frag_page = virt_to_head_page(skb->head); frag_ptr = skb->data; frag_size = head_size; while (offset >= frag_size) { offset -= frag_size; frag_page = skb_frag_page(frag); frag_ptr = skb_frag_address(frag); frag_size = skb_frag_size(frag); frag++; } frag_ptr += offset; frag_len = frag_size - offset; cur_len = min(len, frag_len); __frame_add_frag(frame, frag_page, frag_ptr, cur_len, frag_size); len -= cur_len; while (len > 0) { frag_len = skb_frag_size(frag); cur_len = min(len, frag_len); __frame_add_frag(frame, skb_frag_page(frag), skb_frag_address(frag), cur_len, frag_len); len -= cur_len; frag++; } } static struct sk_buff * __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, int offset, int len, bool reuse_frag, int min_len) { struct sk_buff *frame; int cur_len = len; if (skb->len - offset < len) return NULL; /* * When reusing fragments, copy some data to the head to simplify * ethernet header handling and speed up protocol header processing * in the stack later. */ if (reuse_frag) cur_len = min_t(int, len, min_len); /* * Allocate and reserve two bytes more for payload * alignment since sizeof(struct ethhdr) is 14. 
*/ frame = dev_alloc_skb(hlen + sizeof(struct ethhdr) + 2 + cur_len); if (!frame) return NULL; frame->priority = skb->priority; skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2); skb_copy_bits(skb, offset, skb_put(frame, cur_len), cur_len); len -= cur_len; if (!len) return frame; offset += cur_len; __ieee80211_amsdu_copy_frag(skb, frame, offset, len); return frame; } static u16 ieee80211_amsdu_subframe_length(void *field, u8 mesh_flags, u8 hdr_type) { __le16 *field_le = field; __be16 *field_be = field; u16 len; if (hdr_type >= 2) len = le16_to_cpu(*field_le); else len = be16_to_cpu(*field_be); if (hdr_type) len += __ieee80211_get_mesh_hdrlen(mesh_flags); return len; } bool ieee80211_is_valid_amsdu(struct sk_buff *skb, u8 mesh_hdr) { int offset = 0, subframe_len, padding; for (offset = 0; offset < skb->len; offset += subframe_len + padding) { int remaining = skb->len - offset; struct { __be16 len; u8 mesh_flags; } hdr; u16 len; if (sizeof(hdr) > remaining) return false; if (skb_copy_bits(skb, offset + 2 * ETH_ALEN, &hdr, sizeof(hdr)) < 0) return false; len = ieee80211_amsdu_subframe_length(&hdr.len, hdr.mesh_flags, mesh_hdr); subframe_len = sizeof(struct ethhdr) + len; padding = (4 - subframe_len) & 0x3; if (subframe_len > remaining) return false; } return true; } EXPORT_SYMBOL(ieee80211_is_valid_amsdu); /* * Detects if an MSDU frame was maliciously converted into an A-MSDU * frame by an adversary. This is done by parsing the received frame * as if it were a regular MSDU, even though the A-MSDU flag is set. * * For non-mesh interfaces, detection involves checking whether the * payload, when interpreted as an MSDU, begins with a valid RFC1042 * header. This is done by comparing the A-MSDU subheader's destination * address to the start of the RFC1042 header. * * For mesh interfaces, the MSDU includes a 6-byte Mesh Control field * and an optional variable-length Mesh Address Extension field before * the RFC1042 header. The position of the RFC1042 header must therefore * be calculated based on the mesh header length. * * Since this function intentionally parses an A-MSDU frame as an MSDU, * it only assumes that the A-MSDU subframe header is present, and * beyond this it performs its own bounds checks under the assumption * that the frame is instead parsed as a non-aggregated MSDU. 
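 *
 * (Added illustration, not part of the original comment: in the non-mesh
 *  case the first six bytes of the subframe, i.e. the DA field of the
 *  A-MSDU subframe header, are compared against rfc1042_header
 *  (aa:aa:03:00:00:00). A match means the frame also parses as a plain
 *  MSDU starting with an RFC 1042 LLC/SNAP header, which is the signature
 *  of the A-MSDU aggregation attack being detected here.)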
*/ static bool is_amsdu_aggregation_attack(struct ethhdr *eth, struct sk_buff *skb, enum nl80211_iftype iftype) { int offset; /* Non-mesh case can be directly compared */ if (iftype != NL80211_IFTYPE_MESH_POINT) return ether_addr_equal(eth->h_dest, rfc1042_header); offset = __ieee80211_get_mesh_hdrlen(eth->h_dest[0]); if (offset == 6) { /* Mesh case with empty address extension field */ return ether_addr_equal(eth->h_source, rfc1042_header); } else if (offset + ETH_ALEN <= skb->len) { /* Mesh case with non-empty address extension field */ u8 temp[ETH_ALEN]; skb_copy_bits(skb, offset, temp, ETH_ALEN); return ether_addr_equal(temp, rfc1042_header); } return false; } void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, const u8 *addr, enum nl80211_iftype iftype, const unsigned int extra_headroom, const u8 *check_da, const u8 *check_sa, u8 mesh_control) { unsigned int hlen = ALIGN(extra_headroom, 4); struct sk_buff *frame = NULL; int offset = 0; struct { struct ethhdr eth; uint8_t flags; } hdr; bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb); bool reuse_skb = false; bool last = false; int copy_len = sizeof(hdr.eth); if (iftype == NL80211_IFTYPE_MESH_POINT) copy_len = sizeof(hdr); while (!last) { int remaining = skb->len - offset; unsigned int subframe_len; int len, mesh_len = 0; u8 padding; if (copy_len > remaining) goto purge; skb_copy_bits(skb, offset, &hdr, copy_len); if (iftype == NL80211_IFTYPE_MESH_POINT) mesh_len = __ieee80211_get_mesh_hdrlen(hdr.flags); len = ieee80211_amsdu_subframe_length(&hdr.eth.h_proto, hdr.flags, mesh_control); subframe_len = sizeof(struct ethhdr) + len; padding = (4 - subframe_len) & 0x3; /* the last MSDU has no padding */ if (subframe_len > remaining) goto purge; /* mitigate A-MSDU aggregation injection attacks, to be * checked when processing first subframe (offset == 0). */ if (offset == 0 && is_amsdu_aggregation_attack(&hdr.eth, skb, iftype)) goto purge; offset += sizeof(struct ethhdr); last = remaining <= subframe_len + padding; /* FIXME: should we really accept multicast DA? */ if ((check_da && !is_multicast_ether_addr(hdr.eth.h_dest) && !ether_addr_equal(check_da, hdr.eth.h_dest)) || (check_sa && !ether_addr_equal(check_sa, hdr.eth.h_source))) { offset += len + padding; continue; } /* reuse skb for the last subframe */ if (!skb_is_nonlinear(skb) && !reuse_frag && last) { skb_pull(skb, offset); frame = skb; reuse_skb = true; } else { frame = __ieee80211_amsdu_copy(skb, hlen, offset, len, reuse_frag, 32 + mesh_len); if (!frame) goto purge; offset += len + padding; } skb_reset_network_header(frame); frame->dev = skb->dev; frame->priority = skb->priority; if (likely(iftype != NL80211_IFTYPE_MESH_POINT && ieee80211_get_8023_tunnel_proto(frame->data, &hdr.eth.h_proto))) skb_pull(frame, ETH_ALEN + 2); memcpy(skb_push(frame, sizeof(hdr.eth)), &hdr.eth, sizeof(hdr.eth)); __skb_queue_tail(list, frame); } if (!reuse_skb) dev_kfree_skb(skb); return; purge: __skb_queue_purge(list); dev_kfree_skb(skb); } EXPORT_SYMBOL(ieee80211_amsdu_to_8023s); /* Given a data frame determine the 802.1p/1d tag to use. */ unsigned int cfg80211_classify8021d(struct sk_buff *skb, struct cfg80211_qos_map *qos_map) { unsigned int dscp; unsigned char vlan_priority; unsigned int ret; /* skb->priority values from 256->263 are magic values to * directly indicate a specific 802.1d priority. This is used * to allow 802.1d priority to be passed directly in from VLAN * tags, etc. 
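 *
 * (Added example, not part of the original comment: skb->priority == 260
 *  therefore maps directly to 802.1d UP 4, i.e. 260 - 256, bypassing the
 *  DSCP and QoS-map based classification below.)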
*/ if (skb->priority >= 256 && skb->priority <= 263) { ret = skb->priority - 256; goto out; } if (skb_vlan_tag_present(skb)) { vlan_priority = (skb_vlan_tag_get(skb) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; if (vlan_priority > 0) { ret = vlan_priority; goto out; } } switch (skb->protocol) { case htons(ETH_P_IP): dscp = ipv4_get_dsfield(ip_hdr(skb)) & 0xfc; break; case htons(ETH_P_IPV6): dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & 0xfc; break; case htons(ETH_P_MPLS_UC): case htons(ETH_P_MPLS_MC): { struct mpls_label mpls_tmp, *mpls; mpls = skb_header_pointer(skb, sizeof(struct ethhdr), sizeof(*mpls), &mpls_tmp); if (!mpls) return 0; ret = (ntohl(mpls->entry) & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; goto out; } case htons(ETH_P_80221): /* 802.21 is always network control traffic */ return 7; default: return 0; } if (qos_map) { unsigned int i, tmp_dscp = dscp >> 2; for (i = 0; i < qos_map->num_des; i++) { if (tmp_dscp == qos_map->dscp_exception[i].dscp) { ret = qos_map->dscp_exception[i].up; goto out; } } for (i = 0; i < 8; i++) { if (tmp_dscp >= qos_map->up[i].low && tmp_dscp <= qos_map->up[i].high) { ret = i; goto out; } } } /* The default mapping as defined Section 2.3 in RFC8325: The three * Most Significant Bits (MSBs) of the DSCP are used as the * corresponding L2 markings. */ ret = dscp >> 5; /* Handle specific DSCP values for which the default mapping (as * described above) doesn't adhere to the intended usage of the DSCP * value. See section 4 in RFC8325. Specifically, for the following * Diffserv Service Classes no update is needed: * - Standard: DF * - Low Priority Data: CS1 * - Multimedia Conferencing: AF41, AF42, AF43 * - Network Control Traffic: CS7 * - Real-Time Interactive: CS4 * - Signaling: CS5 */ switch (dscp >> 2) { case 10: case 12: case 14: /* High throughput data: AF11, AF12, AF13 */ ret = 0; break; case 16: /* Operations, Administration, and Maintenance and Provisioning: * CS2 */ ret = 0; break; case 18: case 20: case 22: /* Low latency data: AF21, AF22, AF23 */ ret = 3; break; case 24: /* Broadcasting video: CS3 */ ret = 4; break; case 26: case 28: case 30: /* Multimedia Streaming: AF31, AF32, AF33 */ ret = 4; break; case 44: /* Voice Admit: VA */ ret = 6; break; case 46: /* Telephony traffic: EF */ ret = 6; break; case 48: /* Network Control Traffic: CS6 */ ret = 7; break; } out: return array_index_nospec(ret, IEEE80211_NUM_TIDS); } EXPORT_SYMBOL(cfg80211_classify8021d); const struct element *ieee80211_bss_get_elem(struct cfg80211_bss *bss, u8 id) { const struct cfg80211_bss_ies *ies; ies = rcu_dereference(bss->ies); if (!ies) return NULL; return cfg80211_find_elem(id, ies->data, ies->len); } EXPORT_SYMBOL(ieee80211_bss_get_elem); void cfg80211_upload_connect_keys(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct net_device *dev = wdev->netdev; int i; if (!wdev->connect_keys) return; for (i = 0; i < 4; i++) { if (!wdev->connect_keys->params[i].cipher) continue; if (rdev_add_key(rdev, dev, -1, i, false, NULL, &wdev->connect_keys->params[i])) { netdev_err(dev, "failed to set key %d\n", i); continue; } if (wdev->connect_keys->def == i && rdev_set_default_key(rdev, dev, -1, i, true, true)) { netdev_err(dev, "failed to set defkey %d\n", i); continue; } } kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; } void cfg80211_process_wdev_events(struct wireless_dev *wdev) { struct cfg80211_event *ev; unsigned long flags; spin_lock_irqsave(&wdev->event_lock, flags); while (!list_empty(&wdev->event_list)) { ev = 
list_first_entry(&wdev->event_list, struct cfg80211_event, list); list_del(&ev->list); spin_unlock_irqrestore(&wdev->event_lock, flags); switch (ev->type) { case EVENT_CONNECT_RESULT: __cfg80211_connect_result( wdev->netdev, &ev->cr, ev->cr.status == WLAN_STATUS_SUCCESS); break; case EVENT_ROAMED: __cfg80211_roamed(wdev, &ev->rm); break; case EVENT_DISCONNECTED: __cfg80211_disconnected(wdev->netdev, ev->dc.ie, ev->dc.ie_len, ev->dc.reason, !ev->dc.locally_generated); break; case EVENT_IBSS_JOINED: __cfg80211_ibss_joined(wdev->netdev, ev->ij.bssid, ev->ij.channel); break; case EVENT_STOPPED: cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev); break; case EVENT_PORT_AUTHORIZED: __cfg80211_port_authorized(wdev, ev->pa.peer_addr, ev->pa.td_bitmap, ev->pa.td_bitmap_len); break; } kfree(ev); spin_lock_irqsave(&wdev->event_lock, flags); } spin_unlock_irqrestore(&wdev->event_lock, flags); } void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; lockdep_assert_held(&rdev->wiphy.mtx); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_process_wdev_events(wdev); } int cfg80211_change_iface(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype ntype, struct vif_params *params) { int err; enum nl80211_iftype otype = dev->ieee80211_ptr->iftype; lockdep_assert_held(&rdev->wiphy.mtx); /* don't support changing VLANs, you just re-create them */ if (otype == NL80211_IFTYPE_AP_VLAN) return -EOPNOTSUPP; /* cannot change into P2P device or NAN */ if (ntype == NL80211_IFTYPE_P2P_DEVICE || ntype == NL80211_IFTYPE_NAN) return -EOPNOTSUPP; if (!rdev->ops->change_virtual_intf || !(rdev->wiphy.interface_modes & (1 << ntype))) return -EOPNOTSUPP; if (ntype != otype) { /* if it's part of a bridge, reject changing type to station/ibss */ if (netif_is_bridge_port(dev) && (ntype == NL80211_IFTYPE_ADHOC || ntype == NL80211_IFTYPE_STATION || ntype == NL80211_IFTYPE_P2P_CLIENT)) return -EBUSY; dev->ieee80211_ptr->use_4addr = false; rdev_set_qos_map(rdev, dev, NULL); switch (otype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, dev, -1, true); break; case NL80211_IFTYPE_ADHOC: cfg80211_leave_ibss(rdev, dev, false); break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, true); break; case NL80211_IFTYPE_MESH_POINT: /* mesh should be handled? 
*/ break; case NL80211_IFTYPE_OCB: cfg80211_leave_ocb(rdev, dev); break; default: break; } cfg80211_process_rdev_events(rdev); cfg80211_mlme_purge_registrations(dev->ieee80211_ptr); memset(&dev->ieee80211_ptr->u, 0, sizeof(dev->ieee80211_ptr->u)); memset(&dev->ieee80211_ptr->links, 0, sizeof(dev->ieee80211_ptr->links)); } err = rdev_change_virtual_intf(rdev, dev, ntype, params); WARN_ON(!err && dev->ieee80211_ptr->iftype != ntype); if (!err && params && params->use_4addr != -1) dev->ieee80211_ptr->use_4addr = params->use_4addr; if (!err) { dev->priv_flags &= ~IFF_DONT_BRIDGE; switch (ntype) { case NL80211_IFTYPE_STATION: if (dev->ieee80211_ptr->use_4addr) break; fallthrough; case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_ADHOC: dev->priv_flags |= IFF_DONT_BRIDGE; break; case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MESH_POINT: /* bridging OK */ break; case NL80211_IFTYPE_MONITOR: /* monitor can't bridge anyway */ break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: /* not happening */ break; case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_NAN: WARN_ON(1); break; } } if (!err && ntype != otype && netif_running(dev)) { cfg80211_update_iface_num(rdev, ntype, 1); cfg80211_update_iface_num(rdev, otype, -1); } return err; } static u32 cfg80211_calculate_bitrate_ht(struct rate_info *rate) { int modulation, streams, bitrate; /* the formula below does only work for MCS values smaller than 32 */ if (WARN_ON_ONCE(rate->mcs >= 32)) return 0; modulation = rate->mcs & 7; streams = (rate->mcs >> 3) + 1; bitrate = (rate->bw == RATE_INFO_BW_40) ? 13500000 : 6500000; if (modulation < 4) bitrate *= (modulation + 1); else if (modulation == 4) bitrate *= (modulation + 2); else bitrate *= (modulation + 3); bitrate *= streams; if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) bitrate = (bitrate / 9) * 10; /* do NOT round down here */ return (bitrate + 50000) / 100000; } static u32 cfg80211_calculate_bitrate_dmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { /* control PHY */ [0] = 275, /* SC PHY */ [1] = 3850, [2] = 7700, [3] = 9625, [4] = 11550, [5] = 12512, /* 1251.25 mbps */ [6] = 15400, [7] = 19250, [8] = 23100, [9] = 25025, [10] = 30800, [11] = 38500, [12] = 46200, /* OFDM PHY */ [13] = 6930, [14] = 8662, /* 866.25 mbps */ [15] = 13860, [16] = 17325, [17] = 20790, [18] = 27720, [19] = 34650, [20] = 41580, [21] = 45045, [22] = 51975, [23] = 62370, [24] = 67568, /* 6756.75 mbps */ /* LP-SC PHY */ [25] = 6260, [26] = 8340, [27] = 11120, [28] = 12510, [29] = 16680, [30] = 22240, [31] = 25030, }; if (WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(__mcs2bitrate))) return 0; return __mcs2bitrate[rate->mcs]; } static u32 cfg80211_calculate_bitrate_extended_sc_dmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { [6 - 6] = 26950, /* MCS 9.1 : 2695.0 mbps */ [7 - 6] = 50050, /* MCS 12.1 */ [8 - 6] = 53900, [9 - 6] = 57750, [10 - 6] = 63900, [11 - 6] = 75075, [12 - 6] = 80850, }; /* Extended SC MCS not defined for base MCS below 6 or above 12 */ if (WARN_ON_ONCE(rate->mcs < 6 || rate->mcs > 12)) return 0; return __mcs2bitrate[rate->mcs - 6]; } static u32 cfg80211_calculate_bitrate_edmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { /* control PHY */ [0] = 275, /* SC PHY */ [1] = 3850, [2] = 7700, [3] = 9625, [4] = 11550, [5] = 12512, /* 1251.25 mbps */ [6] = 13475, [7] = 15400, [8] = 19250, [9] = 23100, [10] = 25025, [11] = 26950, [12] = 30800, [13] = 38500, [14] = 
46200, [15] = 50050, [16] = 53900, [17] = 57750, [18] = 69300, [19] = 75075, [20] = 80850, }; if (WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(__mcs2bitrate))) return 0; return __mcs2bitrate[rate->mcs] * rate->n_bonded_ch; } static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) { static const u32 base[4][12] = { { 6500000, 13000000, 19500000, 26000000, 39000000, 52000000, 58500000, 65000000, 78000000, /* not in the spec, but some devices use this: */ 86700000, 97500000, 108300000, }, { 13500000, 27000000, 40500000, 54000000, 81000000, 108000000, 121500000, 135000000, 162000000, 180000000, 202500000, 225000000, }, { 29300000, 58500000, 87800000, 117000000, 175500000, 234000000, 263300000, 292500000, 351000000, 390000000, 438800000, 487500000, }, { 58500000, 117000000, 175500000, 234000000, 351000000, 468000000, 526500000, 585000000, 702000000, 780000000, 877500000, 975000000, }, }; u32 bitrate; int idx; if (rate->mcs > 11) goto warn; switch (rate->bw) { case RATE_INFO_BW_160: idx = 3; break; case RATE_INFO_BW_80: idx = 2; break; case RATE_INFO_BW_40: idx = 1; break; case RATE_INFO_BW_5: case RATE_INFO_BW_10: default: goto warn; case RATE_INFO_BW_20: idx = 0; } bitrate = base[idx][rate->mcs]; bitrate *= rate->nss; if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) bitrate = (bitrate / 9) * 10; /* do NOT round down here */ return (bitrate + 50000) / 100000; warn: WARN_ONCE(1, "invalid rate bw=%d, mcs=%d, nss=%d\n", rate->bw, rate->mcs, rate->nss); return 0; } static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) { #define SCALE 6144 u32 mcs_divisors[14] = { 102399, /* 16.666666... */ 51201, /* 8.333333... */ 34134, /* 5.555555... */ 25599, /* 4.166666... */ 17067, /* 2.777777... */ 12801, /* 2.083333... */ 11377, /* 1.851725... */ 10239, /* 1.666666... */ 8532, /* 1.388888... */ 7680, /* 1.250000... */ 6828, /* 1.111111... */ 6144, /* 1.000000... */ 5690, /* 0.926106... */ 5120, /* 0.833333... 
*/ }; u32 rates_160M[3] = { 960777777, 907400000, 816666666 }; u32 rates_996[3] = { 480388888, 453700000, 408333333 }; u32 rates_484[3] = { 229411111, 216666666, 195000000 }; u32 rates_242[3] = { 114711111, 108333333, 97500000 }; u32 rates_106[3] = { 40000000, 37777777, 34000000 }; u32 rates_52[3] = { 18820000, 17777777, 16000000 }; u32 rates_26[3] = { 9411111, 8888888, 8000000 }; u64 tmp; u32 result; if (WARN_ON_ONCE(rate->mcs > 13)) return 0; if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2)) return 0; if (WARN_ON_ONCE(rate->he_ru_alloc > NL80211_RATE_INFO_HE_RU_ALLOC_2x996)) return 0; if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) return 0; if (rate->bw == RATE_INFO_BW_160 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_2x996)) result = rates_160M[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_80 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_996)) result = rates_996[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_40 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_484)) result = rates_484[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_20 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_242)) result = rates_242[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_106) result = rates_106[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_52) result = rates_52[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26) result = rates_26[rate->he_gi]; else { WARN(1, "invalid HE MCS: bw:%d, ru:%d\n", rate->bw, rate->he_ru_alloc); return 0; } /* now scale to the appropriate MCS */ tmp = result; tmp *= SCALE; do_div(tmp, mcs_divisors[rate->mcs]); result = tmp; /* and take NSS, DCM into account */ result = (result * rate->nss) / 8; if (rate->he_dcm) result /= 2; return result / 10000; } static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) { #define SCALE 6144 static const u32 mcs_divisors[16] = { 102399, /* 16.666666... */ 51201, /* 8.333333... */ 34134, /* 5.555555... */ 25599, /* 4.166666... */ 17067, /* 2.777777... */ 12801, /* 2.083333... */ 11377, /* 1.851725... */ 10239, /* 1.666666... */ 8532, /* 1.388888... */ 7680, /* 1.250000... */ 6828, /* 1.111111... */ 6144, /* 1.000000... */ 5690, /* 0.926106... */ 5120, /* 0.833333... */ 409600, /* 66.666666... */ 204800, /* 33.333333... 
*/ }; static const u32 rates_996[3] = { 480388888, 453700000, 408333333 }; static const u32 rates_484[3] = { 229411111, 216666666, 195000000 }; static const u32 rates_242[3] = { 114711111, 108333333, 97500000 }; static const u32 rates_106[3] = { 40000000, 37777777, 34000000 }; static const u32 rates_52[3] = { 18820000, 17777777, 16000000 }; static const u32 rates_26[3] = { 9411111, 8888888, 8000000 }; u64 tmp; u32 result; if (WARN_ON_ONCE(rate->mcs > 15)) return 0; if (WARN_ON_ONCE(rate->eht_gi > NL80211_RATE_INFO_EHT_GI_3_2)) return 0; if (WARN_ON_ONCE(rate->eht_ru_alloc > NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) return 0; if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) return 0; /* Bandwidth checks for MCS 14 */ if (rate->mcs == 14) { if ((rate->bw != RATE_INFO_BW_EHT_RU && rate->bw != RATE_INFO_BW_80 && rate->bw != RATE_INFO_BW_160 && rate->bw != RATE_INFO_BW_320) || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_996 && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_2x996 && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) { WARN(1, "invalid EHT BW for MCS 14: bw:%d, ru:%d\n", rate->bw, rate->eht_ru_alloc); return 0; } } if (rate->bw == RATE_INFO_BW_320 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) result = 4 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996P484) result = 3 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996) result = 3 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996P484) result = 2 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_160 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996)) result = 2 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996P484P242) result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi] + rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996P484) result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_80 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996)) result = rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484P242) result = rates_484[rate->eht_gi] + rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_40 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484)) result = rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_20 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_242)) result = rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106P26) result = rates_106[rate->eht_gi] + rates_26[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106) result = rates_106[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52P26) result = rates_52[rate->eht_gi] + rates_26[rate->eht_gi]; else if (rate->bw == 
RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52) result = rates_52[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_26) result = rates_26[rate->eht_gi]; else { WARN(1, "invalid EHT MCS: bw:%d, ru:%d\n", rate->bw, rate->eht_ru_alloc); return 0; } /* now scale to the appropriate MCS */ tmp = result; tmp *= SCALE; do_div(tmp, mcs_divisors[rate->mcs]); /* and take NSS */ tmp *= rate->nss; do_div(tmp, 8); result = tmp; return result / 10000; } static u32 cfg80211_calculate_bitrate_s1g(struct rate_info *rate) { /* For 1, 2, 4, 8 and 16 MHz channels */ static const u32 base[5][11] = { { 300000, 600000, 900000, 1200000, 1800000, 2400000, 2700000, 3000000, 3600000, 4000000, /* MCS 10 supported in 1 MHz only */ 150000, }, { 650000, 1300000, 1950000, 2600000, 3900000, 5200000, 5850000, 6500000, 7800000, /* MCS 9 not valid */ }, { 1350000, 2700000, 4050000, 5400000, 8100000, 10800000, 12150000, 13500000, 16200000, 18000000, }, { 2925000, 5850000, 8775000, 11700000, 17550000, 23400000, 26325000, 29250000, 35100000, 39000000, }, { 8580000, 11700000, 17550000, 23400000, 35100000, 46800000, 52650000, 58500000, 70200000, 78000000, }, }; u32 bitrate; /* default is 1 MHz index */ int idx = 0; if (rate->mcs >= 11) goto warn; switch (rate->bw) { case RATE_INFO_BW_16: idx = 4; break; case RATE_INFO_BW_8: idx = 3; break; case RATE_INFO_BW_4: idx = 2; break; case RATE_INFO_BW_2: idx = 1; break; case RATE_INFO_BW_1: idx = 0; break; case RATE_INFO_BW_5: case RATE_INFO_BW_10: case RATE_INFO_BW_20: case RATE_INFO_BW_40: case RATE_INFO_BW_80: case RATE_INFO_BW_160: default: goto warn; } bitrate = base[idx][rate->mcs]; bitrate *= rate->nss; if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) bitrate = (bitrate / 9) * 10; /* do NOT round down here */ return (bitrate + 50000) / 100000; warn: WARN_ONCE(1, "invalid rate bw=%d, mcs=%d, nss=%d\n", rate->bw, rate->mcs, rate->nss); return 0; } u32 cfg80211_calculate_bitrate(struct rate_info *rate) { if (rate->flags & RATE_INFO_FLAGS_MCS) return cfg80211_calculate_bitrate_ht(rate); if (rate->flags & RATE_INFO_FLAGS_DMG) return cfg80211_calculate_bitrate_dmg(rate); if (rate->flags & RATE_INFO_FLAGS_EXTENDED_SC_DMG) return cfg80211_calculate_bitrate_extended_sc_dmg(rate); if (rate->flags & RATE_INFO_FLAGS_EDMG) return cfg80211_calculate_bitrate_edmg(rate); if (rate->flags & RATE_INFO_FLAGS_VHT_MCS) return cfg80211_calculate_bitrate_vht(rate); if (rate->flags & RATE_INFO_FLAGS_HE_MCS) return cfg80211_calculate_bitrate_he(rate); if (rate->flags & RATE_INFO_FLAGS_EHT_MCS) return cfg80211_calculate_bitrate_eht(rate); if (rate->flags & RATE_INFO_FLAGS_S1G_MCS) return cfg80211_calculate_bitrate_s1g(rate); return rate->legacy; } EXPORT_SYMBOL(cfg80211_calculate_bitrate); int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len, enum ieee80211_p2p_attr_id attr, u8 *buf, unsigned int bufsize) { u8 *out = buf; u16 attr_remaining = 0; bool desired_attr = false; u16 desired_len = 0; while (len > 0) { unsigned int iedatalen; unsigned int copy; const u8 *iedata; if (len < 2) return -EILSEQ; iedatalen = ies[1]; if (iedatalen + 2 > len) return -EILSEQ; if (ies[0] != WLAN_EID_VENDOR_SPECIFIC) goto cont; if (iedatalen < 4) goto cont; iedata = ies + 2; /* check WFA OUI, P2P subtype */ if (iedata[0] != 0x50 || iedata[1] != 0x6f || iedata[2] != 0x9a || iedata[3] != 0x09) goto cont; iedatalen -= 4; iedata += 4; /* check attribute continuation into this IE */ copy = min_t(unsigned int, attr_remaining, iedatalen); if 
(copy && desired_attr) { desired_len += copy; if (out) { memcpy(out, iedata, min(bufsize, copy)); out += min(bufsize, copy); bufsize -= min(bufsize, copy); } if (copy == attr_remaining) return desired_len; } attr_remaining -= copy; if (attr_remaining) goto cont; iedatalen -= copy; iedata += copy; while (iedatalen > 0) { u16 attr_len; /* P2P attribute ID & size must fit */ if (iedatalen < 3) return -EILSEQ; desired_attr = iedata[0] == attr; attr_len = get_unaligned_le16(iedata + 1); iedatalen -= 3; iedata += 3; copy = min_t(unsigned int, attr_len, iedatalen); if (desired_attr) { desired_len += copy; if (out) { memcpy(out, iedata, min(bufsize, copy)); out += min(bufsize, copy); bufsize -= min(bufsize, copy); } if (copy == attr_len) return desired_len; } iedata += copy; iedatalen -= copy; attr_remaining = attr_len - copy; } cont: len -= ies[1] + 2; ies += ies[1] + 2; } if (attr_remaining && desired_attr) return -EILSEQ; return -ENOENT; } EXPORT_SYMBOL(cfg80211_get_p2p_attr); static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id, bool id_ext) { int i; /* Make sure array values are legal */ if (WARN_ON(ids[n_ids - 1] == WLAN_EID_EXTENSION)) return false; i = 0; while (i < n_ids) { if (ids[i] == WLAN_EID_EXTENSION) { if (id_ext && (ids[i + 1] == id)) return true; i += 2; continue; } if (ids[i] == id && !id_ext) return true; i++; } return false; } static size_t skip_ie(const u8 *ies, size_t ielen, size_t pos) { /* we assume a validly formed IEs buffer */ u8 len = ies[pos + 1]; pos += 2 + len; /* the IE itself must have 255 bytes for fragments to follow */ if (len < 255) return pos; while (pos < ielen && ies[pos] == WLAN_EID_FRAGMENT) { len = ies[pos + 1]; pos += 2 + len; } return pos; } size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, const u8 *ids, int n_ids, const u8 *after_ric, int n_after_ric, size_t offset) { size_t pos = offset; while (pos < ielen) { u8 ext = 0; if (ies[pos] == WLAN_EID_EXTENSION) ext = 2; if ((pos + ext) >= ielen) break; if (!ieee80211_id_in_list(ids, n_ids, ies[pos + ext], ies[pos] == WLAN_EID_EXTENSION)) break; if (ies[pos] == WLAN_EID_RIC_DATA && n_after_ric) { pos = skip_ie(ies, ielen, pos); while (pos < ielen) { if (ies[pos] == WLAN_EID_EXTENSION) ext = 2; else ext = 0; if ((pos + ext) >= ielen) break; if (!ieee80211_id_in_list(after_ric, n_after_ric, ies[pos + ext], ext == 2)) pos = skip_ie(ies, ielen, pos); else break; } } else { pos = skip_ie(ies, ielen, pos); } } return pos; } EXPORT_SYMBOL(ieee80211_ie_split_ric); void ieee80211_fragment_element(struct sk_buff *skb, u8 *len_pos, u8 frag_id) { unsigned int elem_len; if (!len_pos) return; elem_len = skb->data + skb->len - len_pos - 1; while (elem_len > 255) { /* this one is 255 */ *len_pos = 255; /* remaining data gets smaller */ elem_len -= 255; /* make space for the fragment ID/len in SKB */ skb_put(skb, 2); /* shift back the remaining data to place fragment ID/len */ memmove(len_pos + 255 + 3, len_pos + 255 + 1, elem_len); /* place the fragment ID */ len_pos += 255 + 1; *len_pos = frag_id; /* and point to fragment length to update later */ len_pos++; } *len_pos = elem_len; } EXPORT_SYMBOL(ieee80211_fragment_element); bool ieee80211_operating_class_to_band(u8 operating_class, enum nl80211_band *band) { switch (operating_class) { case 112: case 115 ... 127: case 128 ... 130: *band = NL80211_BAND_5GHZ; return true; case 131 ... 
135: case 137: *band = NL80211_BAND_6GHZ; return true; case 81: case 82: case 83: case 84: *band = NL80211_BAND_2GHZ; return true; case 180: *band = NL80211_BAND_60GHZ; return true; } return false; } EXPORT_SYMBOL(ieee80211_operating_class_to_band); bool ieee80211_operating_class_to_chandef(u8 operating_class, struct ieee80211_channel *chan, struct cfg80211_chan_def *chandef) { u32 control_freq, offset = 0; enum nl80211_band band; if (!ieee80211_operating_class_to_band(operating_class, &band) || !chan || band != chan->band) return false; control_freq = chan->center_freq; chandef->chan = chan; if (control_freq >= 5955) offset = control_freq - 5955; else if (control_freq >= 5745) offset = control_freq - 5745; else if (control_freq >= 5180) offset = control_freq - 5180; offset /= 20; switch (operating_class) { case 81: /* 2 GHz band; 20 MHz; channels 1..13 */ case 82: /* 2 GHz band; 20 MHz; channel 14 */ case 115: /* 5 GHz band; 20 MHz; channels 36,40,44,48 */ case 118: /* 5 GHz band; 20 MHz; channels 52,56,60,64 */ case 121: /* 5 GHz band; 20 MHz; channels 100..144 */ case 124: /* 5 GHz band; 20 MHz; channels 149,153,157,161 */ case 125: /* 5 GHz band; 20 MHz; channels 149..177 */ case 131: /* 6 GHz band; 20 MHz; channels 1..233*/ case 136: /* 6 GHz band; 20 MHz; channel 2 */ chandef->center_freq1 = control_freq; chandef->width = NL80211_CHAN_WIDTH_20; return true; case 83: /* 2 GHz band; 40 MHz; channels 1..9 */ case 116: /* 5 GHz band; 40 MHz; channels 36,44 */ case 119: /* 5 GHz band; 40 MHz; channels 52,60 */ case 122: /* 5 GHz band; 40 MHz; channels 100,108,116,124,132,140 */ case 126: /* 5 GHz band; 40 MHz; channels 149,157,165,173 */ chandef->center_freq1 = control_freq + 10; chandef->width = NL80211_CHAN_WIDTH_40; return true; case 84: /* 2 GHz band; 40 MHz; channels 5..13 */ case 117: /* 5 GHz band; 40 MHz; channels 40,48 */ case 120: /* 5 GHz band; 40 MHz; channels 56,64 */ case 123: /* 5 GHz band; 40 MHz; channels 104,112,120,128,136,144 */ case 127: /* 5 GHz band; 40 MHz; channels 153,161,169,177 */ chandef->center_freq1 = control_freq - 10; chandef->width = NL80211_CHAN_WIDTH_40; return true; case 132: /* 6 GHz band; 40 MHz; channels 1,5,..,229*/ chandef->center_freq1 = control_freq + 10 - (offset & 1) * 20; chandef->width = NL80211_CHAN_WIDTH_40; return true; case 128: /* 5 GHz band; 80 MHz; channels 36..64,100..144,149..177 */ case 133: /* 6 GHz band; 80 MHz; channels 1,5,..,229 */ chandef->center_freq1 = control_freq + 30 - (offset & 3) * 20; chandef->width = NL80211_CHAN_WIDTH_80; return true; case 129: /* 5 GHz band; 160 MHz; channels 36..64,100..144,149..177 */ case 134: /* 6 GHz band; 160 MHz; channels 1,5,..,229 */ chandef->center_freq1 = control_freq + 70 - (offset & 7) * 20; chandef->width = NL80211_CHAN_WIDTH_160; return true; case 130: /* 5 GHz band; 80+80 MHz; channels 36..64,100..144,149..177 */ case 135: /* 6 GHz band; 80+80 MHz; channels 1,5,..,229 */ /* The center_freq2 of 80+80 MHz is unknown */ case 137: /* 6 GHz band; 320 MHz; channels 1,5,..,229 */ /* 320-1 or 320-2 channelization is unknown */ default: return false; } } EXPORT_SYMBOL(ieee80211_operating_class_to_chandef); bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, u8 *op_class) { u8 vht_opclass; u32 freq = chandef->center_freq1; if (freq >= 2412 && freq <= 2472) { if (chandef->width > NL80211_CHAN_WIDTH_40) return false; /* 2.407 GHz, channels 1..13 */ if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 83; /* HT40+ */ else 
*op_class = 84; /* HT40- */ } else { *op_class = 81; } return true; } if (freq == 2484) { /* channel 14 is only for IEEE 802.11b */ if (chandef->width != NL80211_CHAN_WIDTH_20_NOHT) return false; *op_class = 82; /* channel 14 */ return true; } switch (chandef->width) { case NL80211_CHAN_WIDTH_80: vht_opclass = 128; break; case NL80211_CHAN_WIDTH_160: vht_opclass = 129; break; case NL80211_CHAN_WIDTH_80P80: vht_opclass = 130; break; case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_5: return false; /* unsupported for now */ default: vht_opclass = 0; break; } /* 5 GHz, channels 36..48 */ if (freq >= 5180 && freq <= 5240) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 116; else *op_class = 117; } else { *op_class = 115; } return true; } /* 5 GHz, channels 52..64 */ if (freq >= 5260 && freq <= 5320) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 119; else *op_class = 120; } else { *op_class = 118; } return true; } /* 5 GHz, channels 100..144 */ if (freq >= 5500 && freq <= 5720) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 122; else *op_class = 123; } else { *op_class = 121; } return true; } /* 5 GHz, channels 149..169 */ if (freq >= 5745 && freq <= 5845) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 126; else *op_class = 127; } else if (freq <= 5805) { *op_class = 124; } else { *op_class = 125; } return true; } /* 56.16 GHz, channel 1..4 */ if (freq >= 56160 + 2160 * 1 && freq <= 56160 + 2160 * 6) { if (chandef->width >= NL80211_CHAN_WIDTH_40) return false; *op_class = 180; return true; } /* not supported yet */ return false; } EXPORT_SYMBOL(ieee80211_chandef_to_operating_class); static int cfg80211_wdev_bi(struct wireless_dev *wdev) { switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: WARN_ON(wdev->valid_links); return wdev->links[0].ap.beacon_interval; case NL80211_IFTYPE_MESH_POINT: return wdev->u.mesh.beacon_interval; case NL80211_IFTYPE_ADHOC: return wdev->u.ibss.beacon_interval; default: break; } return 0; } static void cfg80211_calculate_bi_data(struct wiphy *wiphy, u32 new_beacon_int, u32 *beacon_int_gcd, bool *beacon_int_different, int radio_idx) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; *beacon_int_gcd = 0; *beacon_int_different = false; rdev = wiphy_to_rdev(wiphy); list_for_each_entry(wdev, &wiphy->wdev_list, list) { int wdev_bi; /* this feature isn't supported with MLO */ if (wdev->valid_links) continue; /* skip wdevs not active on the given wiphy radio */ if (radio_idx >= 0 && !(rdev_get_radio_mask(rdev, wdev->netdev) & BIT(radio_idx))) continue; wdev_bi = cfg80211_wdev_bi(wdev); if (!wdev_bi) continue; if (!*beacon_int_gcd) { *beacon_int_gcd = wdev_bi; continue; } if (wdev_bi == *beacon_int_gcd) continue; *beacon_int_different = true; *beacon_int_gcd = gcd(*beacon_int_gcd, wdev_bi); } if (new_beacon_int && *beacon_int_gcd != new_beacon_int) { if (*beacon_int_gcd) *beacon_int_different = true; *beacon_int_gcd = gcd(*beacon_int_gcd, new_beacon_int); } } int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, u32 beacon_int) { /* * This is just a basic pre-condition check; 
if interface combinations * are possible the driver must already be checking those with a call * to cfg80211_check_combinations(), in which case we'll validate more * through the cfg80211_calculate_bi_data() call and code in * cfg80211_iter_combinations(). */ if (beacon_int < 10 || beacon_int > 10000) return -EINVAL; return 0; } int cfg80211_iter_combinations(struct wiphy *wiphy, struct iface_combination_params *params, void (*iter)(const struct ieee80211_iface_combination *c, void *data), void *data) { const struct wiphy_radio *radio = NULL; const struct ieee80211_iface_combination *c, *cs; const struct ieee80211_regdomain *regdom; enum nl80211_dfs_regions region = 0; int i, j, n, iftype; int num_interfaces = 0; u32 used_iftypes = 0; u32 beacon_int_gcd; bool beacon_int_different; if (params->radio_idx >= 0) radio = &wiphy->radio[params->radio_idx]; /* * This is a bit strange, since the iteration used to rely only on * the data given by the driver, but here it now relies on context, * in form of the currently operating interfaces. * This is OK for all current users, and saves us from having to * push the GCD calculations into all the drivers. * In the future, this should probably rely more on data that's in * cfg80211 already - the only thing not would appear to be any new * interfaces (while being brought up) and channel/radar data. */ cfg80211_calculate_bi_data(wiphy, params->new_beacon_int, &beacon_int_gcd, &beacon_int_different, params->radio_idx); if (params->radar_detect) { rcu_read_lock(); regdom = rcu_dereference(cfg80211_regdomain); if (regdom) region = regdom->dfs_region; rcu_read_unlock(); } for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { num_interfaces += params->iftype_num[iftype]; if (params->iftype_num[iftype] > 0 && !cfg80211_iftype_allowed(wiphy, iftype, 0, 1)) used_iftypes |= BIT(iftype); } if (radio) { cs = radio->iface_combinations; n = radio->n_iface_combinations; } else { cs = wiphy->iface_combinations; n = wiphy->n_iface_combinations; } for (i = 0; i < n; i++) { struct ieee80211_iface_limit *limits; u32 all_iftypes = 0; c = &cs[i]; if (num_interfaces > c->max_interfaces) continue; if (params->num_different_channels > c->num_different_channels) continue; limits = kmemdup_array(c->limits, c->n_limits, sizeof(*limits), GFP_KERNEL); if (!limits) return -ENOMEM; for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { if (cfg80211_iftype_allowed(wiphy, iftype, 0, 1)) continue; for (j = 0; j < c->n_limits; j++) { all_iftypes |= limits[j].types; if (!(limits[j].types & BIT(iftype))) continue; if (limits[j].max < params->iftype_num[iftype]) goto cont; limits[j].max -= params->iftype_num[iftype]; } } if (params->radar_detect != (c->radar_detect_widths & params->radar_detect)) goto cont; if (params->radar_detect && c->radar_detect_regions && !(c->radar_detect_regions & BIT(region))) goto cont; /* Finally check that all iftypes that we're currently * using are actually part of this combination. If they * aren't then we can't use this combination and have * to continue to the next. */ if ((all_iftypes & used_iftypes) != used_iftypes) goto cont; if (beacon_int_gcd) { if (c->beacon_int_min_gcd && beacon_int_gcd < c->beacon_int_min_gcd) goto cont; if (!c->beacon_int_min_gcd && beacon_int_different) goto cont; } /* This combination covered all interface types and * supported the requested numbers, so we're good. 
*/ (*iter)(c, data); cont: kfree(limits); } return 0; } EXPORT_SYMBOL(cfg80211_iter_combinations); static void cfg80211_iter_sum_ifcombs(const struct ieee80211_iface_combination *c, void *data) { int *num = data; (*num)++; } int cfg80211_check_combinations(struct wiphy *wiphy, struct iface_combination_params *params) { int err, num = 0; err = cfg80211_iter_combinations(wiphy, params, cfg80211_iter_sum_ifcombs, &num); if (err) return err; if (num == 0) return -EBUSY; return 0; } EXPORT_SYMBOL(cfg80211_check_combinations); int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy, const struct ieee80211_channel *chan) { const struct wiphy_radio *radio; int i, j; u32 freq; if (!chan) return -EINVAL; freq = ieee80211_channel_to_khz(chan); for (i = 0; i < wiphy->n_radio; i++) { radio = &wiphy->radio[i]; for (j = 0; j < radio->n_freq_range; j++) { if (freq >= radio->freq_range[j].start_freq && freq < radio->freq_range[j].end_freq) return i; } } return -EINVAL; } EXPORT_SYMBOL(cfg80211_get_radio_idx_by_chan); int ieee80211_get_ratemask(struct ieee80211_supported_band *sband, const u8 *rates, unsigned int n_rates, u32 *mask) { int i, j; if (!sband) return -EINVAL; if (n_rates == 0 || n_rates > NL80211_MAX_SUPP_RATES) return -EINVAL; *mask = 0; for (i = 0; i < n_rates; i++) { int rate = (rates[i] & 0x7f) * 5; bool found = false; for (j = 0; j < sband->n_bitrates; j++) { if (sband->bitrates[j].bitrate == rate) { found = true; *mask |= BIT(j); break; } } if (!found) return -EINVAL; } /* * mask must have at least one bit set here since we * didn't accept a 0-length rates array nor allowed * entries in the array that didn't exist */ return 0; } unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy) { enum nl80211_band band; unsigned int n_channels = 0; for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) n_channels += wiphy->bands[band]->n_channels; return n_channels; } EXPORT_SYMBOL(ieee80211_get_num_supported_channels); int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, struct station_info *sinfo) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; wdev = dev->ieee80211_ptr; if (!wdev) return -EOPNOTSUPP; rdev = wiphy_to_rdev(wdev->wiphy); if (!rdev->ops->get_station) return -EOPNOTSUPP; memset(sinfo, 0, sizeof(*sinfo)); guard(wiphy)(&rdev->wiphy); return rdev_get_station(rdev, dev, mac_addr, sinfo); } EXPORT_SYMBOL(cfg80211_get_station); void cfg80211_free_nan_func(struct cfg80211_nan_func *f) { int i; if (!f) return; kfree(f->serv_spec_info); kfree(f->srf_bf); kfree(f->srf_macs); for (i = 0; i < f->num_rx_filters; i++) kfree(f->rx_filters[i].filter); for (i = 0; i < f->num_tx_filters; i++) kfree(f->tx_filters[i].filter); kfree(f->rx_filters); kfree(f->tx_filters); kfree(f); } EXPORT_SYMBOL(cfg80211_free_nan_func); bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, u32 center_freq_khz, u32 bw_khz) { u32 start_freq_khz, end_freq_khz; start_freq_khz = center_freq_khz - (bw_khz / 2); end_freq_khz = center_freq_khz + (bw_khz / 2); if (start_freq_khz >= freq_range->start_freq_khz && end_freq_khz <= freq_range->end_freq_khz) return true; return false; } int cfg80211_link_sinfo_alloc_tid_stats(struct link_station_info *link_sinfo, gfp_t gfp) { link_sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1, sizeof(*link_sinfo->pertid), gfp); if (!link_sinfo->pertid) return -ENOMEM; return 0; } EXPORT_SYMBOL(cfg80211_link_sinfo_alloc_tid_stats); int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp) { 
sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1, sizeof(*(sinfo->pertid)), gfp); if (!sinfo->pertid) return -ENOMEM; return 0; } EXPORT_SYMBOL(cfg80211_sinfo_alloc_tid_stats); /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ /* Ethernet-II snap header (RFC1042 for most EtherTypes) */ const unsigned char rfc1042_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 }; EXPORT_SYMBOL(rfc1042_header); /* Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX) */ const unsigned char bridge_tunnel_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 }; EXPORT_SYMBOL(bridge_tunnel_header); /* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */ struct iapp_layer2_update { u8 da[ETH_ALEN]; /* broadcast */ u8 sa[ETH_ALEN]; /* STA addr */ __be16 len; /* 6 */ u8 dsap; /* 0 */ u8 ssap; /* 0 */ u8 control; u8 xid_info[3]; } __packed; void cfg80211_send_layer2_update(struct net_device *dev, const u8 *addr) { struct iapp_layer2_update *msg; struct sk_buff *skb; /* Send Level 2 Update Frame to update forwarding tables in layer 2 * bridge devices */ skb = dev_alloc_skb(sizeof(*msg)); if (!skb) return; msg = skb_put(skb, sizeof(*msg)); /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID) * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */ eth_broadcast_addr(msg->da); ether_addr_copy(msg->sa, addr); msg->len = htons(6); msg->dsap = 0; msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */ msg->control = 0xaf; /* XID response lsb.1111F101. * F=0 (no poll command; unsolicited frame) */ msg->xid_info[0] = 0x81; /* XID format identifier */ msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */ msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */ skb->dev = dev; skb->protocol = eth_type_trans(skb, dev); memset(skb->cb, 0, sizeof(skb->cb)); netif_rx(skb); } EXPORT_SYMBOL(cfg80211_send_layer2_update); int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, enum ieee80211_vht_chanwidth bw, int mcs, bool ext_nss_bw_capable, unsigned int max_vht_nss) { u16 map = le16_to_cpu(cap->supp_mcs.rx_mcs_map); int ext_nss_bw; int supp_width; int i, mcs_encoding; if (map == 0xffff) return 0; if (WARN_ON(mcs > 9 || max_vht_nss > 8)) return 0; if (mcs <= 7) mcs_encoding = 0; else if (mcs == 8) mcs_encoding = 1; else mcs_encoding = 2; if (!max_vht_nss) { /* find max_vht_nss for the given MCS */ for (i = 7; i >= 0; i--) { int supp = (map >> (2 * i)) & 3; if (supp == 3) continue; if (supp >= mcs_encoding) { max_vht_nss = i + 1; break; } } } if (!(cap->supp_mcs.tx_mcs_map & cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE))) return max_vht_nss; ext_nss_bw = le32_get_bits(cap->vht_cap_info, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); supp_width = le32_get_bits(cap->vht_cap_info, IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK); /* if not capable, treat ext_nss_bw as 0 */ if (!ext_nss_bw_capable) ext_nss_bw = 0; /* This is invalid */ if (supp_width == 3) return 0; /* This is an invalid combination so pretend nothing is supported */ if (supp_width == 2 && (ext_nss_bw == 1 || ext_nss_bw == 2)) return 0; /* * Cover all the special cases according to IEEE 802.11-2016 * Table 9-250. All other cases are either factor of 1 or not * valid/supported. 
*/ switch (bw) { case IEEE80211_VHT_CHANWIDTH_USE_HT: case IEEE80211_VHT_CHANWIDTH_80MHZ: if ((supp_width == 1 || supp_width == 2) && ext_nss_bw == 3) return 2 * max_vht_nss; break; case IEEE80211_VHT_CHANWIDTH_160MHZ: if (supp_width == 0 && (ext_nss_bw == 1 || ext_nss_bw == 2)) return max_vht_nss / 2; if (supp_width == 0 && ext_nss_bw == 3) return (3 * max_vht_nss) / 4; if (supp_width == 1 && ext_nss_bw == 3) return 2 * max_vht_nss; break; case IEEE80211_VHT_CHANWIDTH_80P80MHZ: if (supp_width == 0 && ext_nss_bw == 1) return 0; /* not possible */ if (supp_width == 0 && ext_nss_bw == 2) return max_vht_nss / 2; if (supp_width == 0 && ext_nss_bw == 3) return (3 * max_vht_nss) / 4; if (supp_width == 1 && ext_nss_bw == 0) return 0; /* not possible */ if (supp_width == 1 && ext_nss_bw == 1) return max_vht_nss / 2; if (supp_width == 1 && ext_nss_bw == 2) return (3 * max_vht_nss) / 4; break; } /* not covered or invalid combination received */ return max_vht_nss; } EXPORT_SYMBOL(ieee80211_get_vht_max_nss); bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype, bool is_4addr, u8 check_swif) { bool is_vlan = iftype == NL80211_IFTYPE_AP_VLAN; switch (check_swif) { case 0: if (is_vlan && is_4addr) return wiphy->flags & WIPHY_FLAG_4ADDR_AP; return wiphy->interface_modes & BIT(iftype); case 1: if (!(wiphy->software_iftypes & BIT(iftype)) && is_vlan) return wiphy->flags & WIPHY_FLAG_4ADDR_AP; return wiphy->software_iftypes & BIT(iftype); default: break; } return false; } EXPORT_SYMBOL(cfg80211_iftype_allowed); void cfg80211_remove_link(struct wireless_dev *wdev, unsigned int link_id) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); lockdep_assert_wiphy(wdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, wdev->netdev, link_id, true); break; default: /* per-link not relevant */ break; } rdev_del_intf_link(rdev, wdev, link_id); wdev->valid_links &= ~BIT(link_id); eth_zero_addr(wdev->links[link_id].addr); } void cfg80211_remove_links(struct wireless_dev *wdev) { unsigned int link_id; /* * links are controlled by upper layers (userspace/cfg) * only for AP mode, so only remove them here for AP */ if (wdev->iftype != NL80211_IFTYPE_AP) return; if (wdev->valid_links) { for_each_valid_link(wdev, link_id) cfg80211_remove_link(wdev, link_id); } } int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { cfg80211_remove_links(wdev); return rdev_del_virtual_intf(rdev, wdev); } const struct wiphy_iftype_ext_capab * cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type) { int i; for (i = 0; i < wiphy->num_iftype_ext_capab; i++) { if (wiphy->iftype_ext_capab[i].iftype == type) return &wiphy->iftype_ext_capab[i]; } return NULL; } EXPORT_SYMBOL(cfg80211_get_iftype_ext_capa); static bool ieee80211_radio_freq_range_valid(const struct wiphy_radio *radio, u32 freq, u32 width) { const struct wiphy_radio_freq_range *r; int i; for (i = 0; i < radio->n_freq_range; i++) { r = &radio->freq_range[i]; if (freq - width / 2 >= r->start_freq && freq + width / 2 <= r->end_freq) return true; } return false; } bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, const struct cfg80211_chan_def *chandef) { u32 freq, width; freq = ieee80211_chandef_to_khz(chandef); width = MHZ_TO_KHZ(cfg80211_chandef_get_width(chandef)); if (!ieee80211_radio_freq_range_valid(radio, freq, width)) return false; freq = MHZ_TO_KHZ(chandef->center_freq2); if (freq && 
!ieee80211_radio_freq_range_valid(radio, freq, width)) return false; return true; } EXPORT_SYMBOL(cfg80211_radio_chandef_valid); bool cfg80211_wdev_channel_allowed(struct wireless_dev *wdev, struct ieee80211_channel *chan) { struct wiphy *wiphy = wdev->wiphy; const struct wiphy_radio *radio; struct cfg80211_chan_def chandef; u32 radio_mask; int i; radio_mask = wdev->radio_mask; if (!wiphy->n_radio || radio_mask == BIT(wiphy->n_radio) - 1) return true; cfg80211_chandef_create(&chandef, chan, NL80211_CHAN_HT20); for (i = 0; i < wiphy->n_radio; i++) { if (!(radio_mask & BIT(i))) continue; radio = &wiphy->radio[i]; if (!cfg80211_radio_chandef_valid(radio, &chandef)) continue; return true; } return false; } EXPORT_SYMBOL(cfg80211_wdev_channel_allowed);
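The operating-class helpers above are normally consumed together with a channel lookup. Below is a minimal usage sketch, assuming a hypothetical caller example_peer_chandef() and resolution of the control channel via ieee80211_get_channel(); it is illustrative only and not part of util.c:

static int example_peer_chandef(struct wiphy *wiphy, u8 op_class,
				u32 ctrl_freq_mhz,
				struct cfg80211_chan_def *chandef)
{
	struct ieee80211_channel *chan;

	/* Resolve the announced control channel on this wiphy */
	chan = ieee80211_get_channel(wiphy, ctrl_freq_mhz);
	if (!chan)
		return -EINVAL;

	/* Fills chan/width/center_freq1 for 20/40/80/160 MHz classes;
	 * 80+80 and 320 MHz cannot be derived from the class alone and
	 * make the helper return false.
	 */
	if (!ieee80211_operating_class_to_chandef(op_class, chan, chandef))
		return -EINVAL;

	return 0;
}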
// SPDX-License-Identifier: GPL-2.0-only /* L2TP core. * * Copyright (c) 2008,2009,2010 Katalix Systems Ltd * * This file contains some code of the original L2TPv2 pppol2tp * driver, which has the following copyright: * * Authors: Martijn van Oosterhout <kleptog@svana.org> * James Chapman (jchapman@katalix.com) * Contributors: * Michal Ostrowski <mostrows@speakeasy.net> * Arnaldo Carvalho de Melo <acme@xconectiva.com.br> * David S.
Miller (davem@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/string.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/spinlock.h> #include <linux/kthread.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/jiffies.h> #include <linux/netdevice.h> #include <linux/net.h> #include <linux/inetdevice.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/udp.h> #include <linux/l2tp.h> #include <linux/sort.h> #include <linux/file.h> #include <linux/nsproxy.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/dst.h> #include <net/ip.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/inet_common.h> #include <net/xfrm.h> #include <net/protocol.h> #include <net/inet6_connection_sock.h> #include <net/inet_ecn.h> #include <net/ip6_route.h> #include <net/ip6_checksum.h> #include <asm/byteorder.h> #include <linux/atomic.h> #include "l2tp_core.h" #define CREATE_TRACE_POINTS #include "trace.h" #define L2TP_DRV_VERSION "V2.0" /* L2TP header constants */ #define L2TP_HDRFLAG_T 0x8000 #define L2TP_HDRFLAG_L 0x4000 #define L2TP_HDRFLAG_S 0x0800 #define L2TP_HDRFLAG_O 0x0200 #define L2TP_HDRFLAG_P 0x0100 #define L2TP_HDR_VER_MASK 0x000F #define L2TP_HDR_VER_2 0x0002 #define L2TP_HDR_VER_3 0x0003 /* L2TPv3 default L2-specific sublayer */ #define L2TP_SLFLAG_S 0x40000000 #define L2TP_SL_SEQ_MASK 0x00ffffff #define L2TP_HDR_SIZE_MAX 14 /* Default trace flags */ #define L2TP_DEFAULT_DEBUG_FLAGS 0 #define L2TP_DEPTH_NESTING 2 #if L2TP_DEPTH_NESTING == SINGLE_DEPTH_NESTING #error "L2TP requires its own lockdep subclass" #endif /* Private data stored for received packets in the skb. */ struct l2tp_skb_cb { u32 ns; u16 has_seq; u16 length; unsigned long expires; }; #define L2TP_SKB_CB(skb) ((struct l2tp_skb_cb *)&(skb)->cb[sizeof(struct inet_skb_parm)]) static struct workqueue_struct *l2tp_wq; /* per-net private data for this module */ static unsigned int l2tp_net_id; struct l2tp_net { /* Lock for write access to l2tp_tunnel_idr */ spinlock_t l2tp_tunnel_idr_lock; struct idr l2tp_tunnel_idr; /* Lock for write access to l2tp_v[23]_session_idr/htable */ spinlock_t l2tp_session_idr_lock; struct idr l2tp_v2_session_idr; struct idr l2tp_v3_session_idr; struct hlist_head l2tp_v3_session_htable[16]; }; static u32 l2tp_v2_session_key(u16 tunnel_id, u16 session_id) { return ((u32)tunnel_id) << 16 | session_id; } static unsigned long l2tp_v3_session_hashkey(struct sock *sk, u32 session_id) { return ((unsigned long)sk) + session_id; } #if IS_ENABLED(CONFIG_IPV6) static bool l2tp_sk_is_v6(struct sock *sk) { return sk->sk_family == PF_INET6 && !ipv6_addr_v4mapped(&sk->sk_v6_daddr); } #endif static struct l2tp_net *l2tp_pernet(const struct net *net) { return net_generic(net, l2tp_net_id); } static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) { struct sock *sk = tunnel->sock; trace_free_tunnel(tunnel); if (sk) { /* Disable udp encapsulation */ switch (tunnel->encap) { case L2TP_ENCAPTYPE_UDP: /* No longer an encapsulation socket. 
See net/ipv4/udp.c */ WRITE_ONCE(udp_sk(sk)->encap_type, 0); udp_sk(sk)->encap_rcv = NULL; udp_sk(sk)->encap_destroy = NULL; break; case L2TP_ENCAPTYPE_IP: break; } tunnel->sock = NULL; sock_put(sk); } kfree_rcu(tunnel, rcu); } static void l2tp_session_free(struct l2tp_session *session) { trace_free_session(session); if (session->tunnel) l2tp_tunnel_put(session->tunnel); kfree_rcu(session, rcu); } struct l2tp_tunnel *l2tp_sk_to_tunnel(const struct sock *sk) { const struct net *net = sock_net(sk); unsigned long tunnel_id, tmp; struct l2tp_tunnel *tunnel; struct l2tp_net *pn; rcu_read_lock_bh(); pn = l2tp_pernet(net); idr_for_each_entry_ul(&pn->l2tp_tunnel_idr, tunnel, tmp, tunnel_id) { if (tunnel && tunnel->sock == sk && refcount_inc_not_zero(&tunnel->ref_count)) { rcu_read_unlock_bh(); return tunnel; } } rcu_read_unlock_bh(); return NULL; } EXPORT_SYMBOL_GPL(l2tp_sk_to_tunnel); void l2tp_tunnel_put(struct l2tp_tunnel *tunnel) { if (refcount_dec_and_test(&tunnel->ref_count)) l2tp_tunnel_free(tunnel); } EXPORT_SYMBOL_GPL(l2tp_tunnel_put); void l2tp_session_put(struct l2tp_session *session) { if (refcount_dec_and_test(&session->ref_count)) l2tp_session_free(session); } EXPORT_SYMBOL_GPL(l2tp_session_put); /* Lookup a tunnel. A new reference is held on the returned tunnel. */ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id) { const struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_tunnel *tunnel; rcu_read_lock_bh(); tunnel = idr_find(&pn->l2tp_tunnel_idr, tunnel_id); if (tunnel && refcount_inc_not_zero(&tunnel->ref_count)) { rcu_read_unlock_bh(); return tunnel; } rcu_read_unlock_bh(); return NULL; } EXPORT_SYMBOL_GPL(l2tp_tunnel_get); struct l2tp_tunnel *l2tp_tunnel_get_next(const struct net *net, unsigned long *key) { struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_tunnel *tunnel = NULL; rcu_read_lock_bh(); again: tunnel = idr_get_next_ul(&pn->l2tp_tunnel_idr, key); if (tunnel) { if (refcount_inc_not_zero(&tunnel->ref_count)) { rcu_read_unlock_bh(); return tunnel; } (*key)++; goto again; } rcu_read_unlock_bh(); return NULL; } EXPORT_SYMBOL_GPL(l2tp_tunnel_get_next); struct l2tp_session *l2tp_v3_session_get(const struct net *net, struct sock *sk, u32 session_id) { const struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_session *session; rcu_read_lock_bh(); session = idr_find(&pn->l2tp_v3_session_idr, session_id); if (session && !hash_hashed(&session->hlist) && refcount_inc_not_zero(&session->ref_count)) { rcu_read_unlock_bh(); return session; } /* If we get here and session is non-NULL, the session_id * collides with one in another tunnel. If sk is non-NULL, * find the session matching sk. */ if (session && sk) { unsigned long key = l2tp_v3_session_hashkey(sk, session->session_id); hash_for_each_possible_rcu(pn->l2tp_v3_session_htable, session, hlist, key) { /* session->tunnel may be NULL if another thread is in * l2tp_session_register and has added an item to * l2tp_v3_session_htable but hasn't yet added the * session to its tunnel's session_list. 
*/ struct l2tp_tunnel *tunnel = READ_ONCE(session->tunnel); if (session->session_id == session_id && tunnel && tunnel->sock == sk && refcount_inc_not_zero(&session->ref_count)) { rcu_read_unlock_bh(); return session; } } } rcu_read_unlock_bh(); return NULL; } EXPORT_SYMBOL_GPL(l2tp_v3_session_get); struct l2tp_session *l2tp_v2_session_get(const struct net *net, u16 tunnel_id, u16 session_id) { u32 session_key = l2tp_v2_session_key(tunnel_id, session_id); const struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_session *session; rcu_read_lock_bh(); session = idr_find(&pn->l2tp_v2_session_idr, session_key); if (session && refcount_inc_not_zero(&session->ref_count)) { rcu_read_unlock_bh(); return session; } rcu_read_unlock_bh(); return NULL; } EXPORT_SYMBOL_GPL(l2tp_v2_session_get); struct l2tp_session *l2tp_session_get(const struct net *net, struct sock *sk, int pver, u32 tunnel_id, u32 session_id) { if (pver == L2TP_HDR_VER_2) return l2tp_v2_session_get(net, tunnel_id, session_id); else return l2tp_v3_session_get(net, sk, session_id); } EXPORT_SYMBOL_GPL(l2tp_session_get); static struct l2tp_session *l2tp_v2_session_get_next(const struct net *net, u16 tid, unsigned long *key) { struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_session *session = NULL; /* Start searching within the range of the tid */ if (*key == 0) *key = l2tp_v2_session_key(tid, 0); rcu_read_lock_bh(); again: session = idr_get_next_ul(&pn->l2tp_v2_session_idr, key); if (session) { struct l2tp_tunnel *tunnel = READ_ONCE(session->tunnel); /* ignore sessions with id 0 as they are internal for pppol2tp */ if (session->session_id == 0) { (*key)++; goto again; } if (tunnel->tunnel_id == tid && refcount_inc_not_zero(&session->ref_count)) { rcu_read_unlock_bh(); return session; } (*key)++; if (tunnel->tunnel_id == tid) goto again; } rcu_read_unlock_bh(); return NULL; } static struct l2tp_session *l2tp_v3_session_get_next(const struct net *net, u32 tid, struct sock *sk, unsigned long *key) { struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_session *session = NULL; rcu_read_lock_bh(); again: session = idr_get_next_ul(&pn->l2tp_v3_session_idr, key); if (session && !hash_hashed(&session->hlist)) { struct l2tp_tunnel *tunnel = READ_ONCE(session->tunnel); if (tunnel && tunnel->tunnel_id == tid && refcount_inc_not_zero(&session->ref_count)) { rcu_read_unlock_bh(); return session; } (*key)++; goto again; } /* If we get here and session is non-NULL, the IDR entry may be one * where the session_id collides with one in another tunnel. Check * session_htable for a match. There can only be one session of a given * ID per tunnel so we can return as soon as a match is found. */ if (session && hash_hashed(&session->hlist)) { unsigned long hkey = l2tp_v3_session_hashkey(sk, session->session_id); u32 sid = session->session_id; hash_for_each_possible_rcu(pn->l2tp_v3_session_htable, session, hlist, hkey) { struct l2tp_tunnel *tunnel = READ_ONCE(session->tunnel); if (session->session_id == sid && tunnel && tunnel->tunnel_id == tid && refcount_inc_not_zero(&session->ref_count)) { rcu_read_unlock_bh(); return session; } } /* If no match found, the colliding session ID isn't in our * tunnel so try the next session ID. 
*/ (*key)++; goto again; } rcu_read_unlock_bh(); return NULL; } struct l2tp_session *l2tp_session_get_next(const struct net *net, struct sock *sk, int pver, u32 tunnel_id, unsigned long *key) { if (pver == L2TP_HDR_VER_2) return l2tp_v2_session_get_next(net, tunnel_id, key); else return l2tp_v3_session_get_next(net, tunnel_id, sk, key); } EXPORT_SYMBOL_GPL(l2tp_session_get_next); /* Lookup a session by interface name. * This is very inefficient but is only used by management interfaces. */ struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, const char *ifname) { struct l2tp_net *pn = l2tp_pernet(net); unsigned long tunnel_id, tmp; struct l2tp_session *session; struct l2tp_tunnel *tunnel; rcu_read_lock_bh(); idr_for_each_entry_ul(&pn->l2tp_tunnel_idr, tunnel, tmp, tunnel_id) { if (tunnel) { list_for_each_entry_rcu(session, &tunnel->session_list, list) { if (!strcmp(session->ifname, ifname)) { refcount_inc(&session->ref_count); rcu_read_unlock_bh(); return session; } } } } rcu_read_unlock_bh(); return NULL; } EXPORT_SYMBOL_GPL(l2tp_session_get_by_ifname); static void l2tp_session_coll_list_add(struct l2tp_session_coll_list *clist, struct l2tp_session *session) { refcount_inc(&session->ref_count); WARN_ON_ONCE(session->coll_list); session->coll_list = clist; spin_lock(&clist->lock); list_add(&session->clist, &clist->list); spin_unlock(&clist->lock); } static int l2tp_session_collision_add(struct l2tp_net *pn, struct l2tp_session *session1, struct l2tp_session *session2) { struct l2tp_session_coll_list *clist; lockdep_assert_held(&pn->l2tp_session_idr_lock); if (!session2) return -EEXIST; /* If existing session is in IP-encap tunnel, refuse new session */ if (session2->tunnel->encap == L2TP_ENCAPTYPE_IP) return -EEXIST; clist = session2->coll_list; if (!clist) { /* First collision. Allocate list to manage the collided sessions * and add the existing session to the list. */ clist = kmalloc(sizeof(*clist), GFP_ATOMIC); if (!clist) return -ENOMEM; spin_lock_init(&clist->lock); INIT_LIST_HEAD(&clist->list); refcount_set(&clist->ref_count, 1); l2tp_session_coll_list_add(clist, session2); } /* If existing session isn't already in the session hlist, add it. */ if (!hash_hashed(&session2->hlist)) hash_add_rcu(pn->l2tp_v3_session_htable, &session2->hlist, session2->hlist_key); /* Add new session to the hlist and collision list */ hash_add_rcu(pn->l2tp_v3_session_htable, &session1->hlist, session1->hlist_key); refcount_inc(&clist->ref_count); l2tp_session_coll_list_add(clist, session1); return 0; } static void l2tp_session_collision_del(struct l2tp_net *pn, struct l2tp_session *session) { struct l2tp_session_coll_list *clist = session->coll_list; unsigned long session_key = session->session_id; struct l2tp_session *session2; lockdep_assert_held(&pn->l2tp_session_idr_lock); hash_del_rcu(&session->hlist); if (clist) { /* Remove session from its collision list. If there * are other sessions with the same ID, replace this * session's IDR entry with that session, otherwise * remove the IDR entry. If this is the last session, * the collision list data is freed. 
*/ spin_lock(&clist->lock); list_del_init(&session->clist); session2 = list_first_entry_or_null(&clist->list, struct l2tp_session, clist); if (session2) { void *old = idr_replace(&pn->l2tp_v3_session_idr, session2, session_key); WARN_ON_ONCE(IS_ERR_VALUE(old)); } else { void *removed = idr_remove(&pn->l2tp_v3_session_idr, session_key); WARN_ON_ONCE(removed != session); } session->coll_list = NULL; spin_unlock(&clist->lock); if (refcount_dec_and_test(&clist->ref_count)) kfree(clist); l2tp_session_put(session); } } int l2tp_session_register(struct l2tp_session *session, struct l2tp_tunnel *tunnel) { struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); struct l2tp_session *other_session = NULL; void *old = NULL; u32 session_key; int err; spin_lock_bh(&tunnel->list_lock); spin_lock_bh(&pn->l2tp_session_idr_lock); if (!tunnel->acpt_newsess) { err = -ENODEV; goto out; } if (tunnel->version == L2TP_HDR_VER_3) { session_key = session->session_id; err = idr_alloc_u32(&pn->l2tp_v3_session_idr, NULL, &session_key, session_key, GFP_ATOMIC); /* IP encap expects session IDs to be globally unique, while * UDP encap doesn't. This isn't per the RFC, which says that * sessions are identified only by the session ID, but is to * support existing userspace which depends on it. */ if (err == -ENOSPC && tunnel->encap == L2TP_ENCAPTYPE_UDP) { other_session = idr_find(&pn->l2tp_v3_session_idr, session_key); err = l2tp_session_collision_add(pn, session, other_session); } } else { session_key = l2tp_v2_session_key(tunnel->tunnel_id, session->session_id); err = idr_alloc_u32(&pn->l2tp_v2_session_idr, NULL, &session_key, session_key, GFP_ATOMIC); } if (err) { if (err == -ENOSPC) err = -EEXIST; goto out; } refcount_inc(&tunnel->ref_count); WRITE_ONCE(session->tunnel, tunnel); list_add_rcu(&session->list, &tunnel->session_list); /* this makes session available to lockless getters */ if (tunnel->version == L2TP_HDR_VER_3) { if (!other_session) old = idr_replace(&pn->l2tp_v3_session_idr, session, session_key); } else { old = idr_replace(&pn->l2tp_v2_session_idr, session, session_key); } /* old should be NULL, unless something removed or modified * the IDR entry after our idr_alloc_32 above (which shouldn't * happen). */ WARN_ON_ONCE(old); out: spin_unlock_bh(&pn->l2tp_session_idr_lock); spin_unlock_bh(&tunnel->list_lock); if (!err) trace_register_session(session); return err; } EXPORT_SYMBOL_GPL(l2tp_session_register); /***************************************************************************** * Receive data handling *****************************************************************************/ /* Queue a skb in order. We come here only if the skb has an L2TP sequence * number. */ static void l2tp_recv_queue_skb(struct l2tp_session *session, struct sk_buff *skb) { struct sk_buff *skbp; struct sk_buff *tmp; u32 ns = L2TP_SKB_CB(skb)->ns; spin_lock_bh(&session->reorder_q.lock); skb_queue_walk_safe(&session->reorder_q, skbp, tmp) { if (L2TP_SKB_CB(skbp)->ns > ns) { __skb_queue_before(&session->reorder_q, skbp, skb); atomic_long_inc(&session->stats.rx_oos_packets); goto out; } } __skb_queue_tail(&session->reorder_q, skb); out: spin_unlock_bh(&session->reorder_q.lock); } /* Dequeue a single skb. */ static void l2tp_recv_dequeue_skb(struct l2tp_session *session, struct sk_buff *skb) { struct l2tp_tunnel *tunnel = session->tunnel; int length = L2TP_SKB_CB(skb)->length; /* We're about to requeue the skb, so return resources * to its current owner (a socket receive buffer). 
*/ skb_orphan(skb); atomic_long_inc(&tunnel->stats.rx_packets); atomic_long_add(length, &tunnel->stats.rx_bytes); atomic_long_inc(&session->stats.rx_packets); atomic_long_add(length, &session->stats.rx_bytes); if (L2TP_SKB_CB(skb)->has_seq) { /* Bump our Nr */ session->nr++; session->nr &= session->nr_max; trace_session_seqnum_update(session); } /* call private receive handler */ if (session->recv_skb) (*session->recv_skb)(session, skb, L2TP_SKB_CB(skb)->length); else kfree_skb(skb); } /* Dequeue skbs from the session's reorder_q, subject to packet order. * Skbs that have been in the queue for too long are simply discarded. */ static void l2tp_recv_dequeue(struct l2tp_session *session) { struct sk_buff *skb; struct sk_buff *tmp; /* If the pkt at the head of the queue has the nr that we * expect to send up next, dequeue it and any other * in-sequence packets behind it. */ start: spin_lock_bh(&session->reorder_q.lock); skb_queue_walk_safe(&session->reorder_q, skb, tmp) { struct l2tp_skb_cb *cb = L2TP_SKB_CB(skb); /* If the packet has been pending on the queue for too long, discard it */ if (time_after(jiffies, cb->expires)) { atomic_long_inc(&session->stats.rx_seq_discards); atomic_long_inc(&session->stats.rx_errors); trace_session_pkt_expired(session, cb->ns); session->reorder_skip = 1; __skb_unlink(skb, &session->reorder_q); kfree_skb(skb); continue; } if (cb->has_seq) { if (session->reorder_skip) { session->reorder_skip = 0; session->nr = cb->ns; trace_session_seqnum_reset(session); } if (cb->ns != session->nr) goto out; } __skb_unlink(skb, &session->reorder_q); /* Process the skb. We release the queue lock while we * do so to let other contexts process the queue. */ spin_unlock_bh(&session->reorder_q.lock); l2tp_recv_dequeue_skb(session, skb); goto start; } out: spin_unlock_bh(&session->reorder_q.lock); } static int l2tp_seq_check_rx_window(struct l2tp_session *session, u32 nr) { u32 nws; if (nr >= session->nr) nws = nr - session->nr; else nws = (session->nr_max + 1) - (session->nr - nr); return nws < session->nr_window_size; } /* If packet has sequence numbers, queue it if acceptable. Returns 0 if * acceptable, else non-zero. */ static int l2tp_recv_data_seq(struct l2tp_session *session, struct sk_buff *skb) { struct l2tp_skb_cb *cb = L2TP_SKB_CB(skb); if (!l2tp_seq_check_rx_window(session, cb->ns)) { /* Packet sequence number is outside allowed window. * Discard it. */ trace_session_pkt_outside_rx_window(session, cb->ns); goto discard; } if (session->reorder_timeout != 0) { /* Packet reordering enabled. Add skb to session's * reorder queue, in order of ns. */ l2tp_recv_queue_skb(session, skb); goto out; } /* Packet reordering disabled. Discard out-of-sequence packets, while * tracking the number of in-sequence packets after the first OOS packet * is seen. After nr_oos_count_max in-sequence packets, reset the * sequence number to re-enable packet reception. */ if (cb->ns == session->nr) { skb_queue_tail(&session->reorder_q, skb); } else { u32 nr_oos = cb->ns; u32 nr_next = (session->nr_oos + 1) & session->nr_max; if (nr_oos == nr_next) session->nr_oos_count++; else session->nr_oos_count = 0; session->nr_oos = nr_oos; if (session->nr_oos_count > session->nr_oos_count_max) { session->reorder_skip = 1; } if (!session->reorder_skip) { atomic_long_inc(&session->stats.rx_seq_discards); trace_session_pkt_oos(session, cb->ns); goto discard; } skb_queue_tail(&session->reorder_q, skb); } out: return 0; discard: return 1; } /* Do receive processing of L2TP data frames.
We handle both L2TPv2 * and L2TPv3 data frames here. * * L2TPv2 Data Message Header * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |T|L|x|x|S|x|O|P|x|x|x|x| Ver | Length (opt) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Tunnel ID | Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Ns (opt) | Nr (opt) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Offset Size (opt) | Offset pad... (opt) * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Data frames are marked by T=0. All other fields are the same as * those in L2TP control frames. * * L2TPv3 Data Message Header * * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | L2TP Session Header | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | L2-Specific Sublayer | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Tunnel Payload ... * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * L2TPv3 Session Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Cookie (optional, maximum 64 bits)... * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * L2TPv3 L2-Specific Sublayer Format * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |x|S|x|x|x|x|x|x| Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Cookie value and sublayer format are negotiated with the peer when * the session is set up. Unlike L2TPv2, we do not need to parse the * packet header to determine if optional fields are present. * * Caller must already have parsed the frame and determined that it is * a data (not control) frame before coming here. Fields up to the * session-id have already been parsed and ptr points to the data * after the session-id. */ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length) { struct l2tp_tunnel *tunnel = session->tunnel; int offset; /* Parse and check optional cookie */ if (session->peer_cookie_len > 0) { if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) { pr_debug_ratelimited("%s: cookie mismatch (%u/%u). Discarding.\n", tunnel->name, tunnel->tunnel_id, session->session_id); atomic_long_inc(&session->stats.rx_cookie_discards); goto discard; } ptr += session->peer_cookie_len; } /* Handle the optional sequence numbers. Sequence numbers are * in different places for L2TPv2 and L2TPv3. * * If we are the LAC, enable/disable sequence numbers under * the control of the LNS. If no sequence numbers present but * we were expecting them, discard frame. 
*/ L2TP_SKB_CB(skb)->has_seq = 0; if (tunnel->version == L2TP_HDR_VER_2) { if (hdrflags & L2TP_HDRFLAG_S) { /* Store L2TP info in the skb */ L2TP_SKB_CB(skb)->ns = ntohs(*(__be16 *)ptr); L2TP_SKB_CB(skb)->has_seq = 1; ptr += 2; /* Skip past nr in the header */ ptr += 2; } } else if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) { u32 l2h = ntohl(*(__be32 *)ptr); if (l2h & 0x40000000) { /* Store L2TP info in the skb */ L2TP_SKB_CB(skb)->ns = l2h & 0x00ffffff; L2TP_SKB_CB(skb)->has_seq = 1; } ptr += 4; } if (L2TP_SKB_CB(skb)->has_seq) { /* Received a packet with sequence numbers. If we're the LAC, * check if we are sending sequence numbers and if not, * configure it so. */ if (!session->lns_mode && !session->send_seq) { trace_session_seqnum_lns_enable(session); session->send_seq = 1; l2tp_session_set_header_len(session, tunnel->version, tunnel->encap); } } else { /* No sequence numbers. * If user has configured mandatory sequence numbers, discard. */ if (session->recv_seq) { pr_debug_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n", session->name); atomic_long_inc(&session->stats.rx_seq_discards); goto discard; } /* If we're the LAC and we're sending sequence numbers, the * LNS has requested that we no longer send sequence numbers. * If we're the LNS and we're sending sequence numbers, the * LAC is broken. Discard the frame. */ if (!session->lns_mode && session->send_seq) { trace_session_seqnum_lns_disable(session); session->send_seq = 0; l2tp_session_set_header_len(session, tunnel->version, tunnel->encap); } else if (session->send_seq) { pr_debug_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n", session->name); atomic_long_inc(&session->stats.rx_seq_discards); goto discard; } } /* Session data offset is defined only for L2TPv2 and is * indicated by an optional 16-bit value in the header. */ if (tunnel->version == L2TP_HDR_VER_2) { /* If offset bit set, skip it. */ if (hdrflags & L2TP_HDRFLAG_O) { offset = ntohs(*(__be16 *)ptr); ptr += 2 + offset; } } offset = ptr - optr; if (!pskb_may_pull(skb, offset)) goto discard; __skb_pull(skb, offset); /* Prepare skb for adding to the session's reorder_q. Hold * packets for max reorder_timeout or 1 second if not * reordering. */ L2TP_SKB_CB(skb)->length = length; L2TP_SKB_CB(skb)->expires = jiffies + (session->reorder_timeout ? session->reorder_timeout : HZ); /* Add packet to the session's receive queue. Reordering is done here, if * enabled. Saved L2TP protocol info is stored in skb->cb[]. */ if (L2TP_SKB_CB(skb)->has_seq) { if (l2tp_recv_data_seq(session, skb)) goto discard; } else { /* No sequence numbers. Add the skb to the tail of the * reorder queue. This ensures that it will be * delivered after all previous sequenced skbs. */ skb_queue_tail(&session->reorder_q, skb); } /* Try to dequeue as many skbs from reorder_q as we can. */ l2tp_recv_dequeue(session); return; discard: atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); } EXPORT_SYMBOL_GPL(l2tp_recv_common); /* Drop skbs from the session's reorder_q */ static void l2tp_session_queue_purge(struct l2tp_session *session) { struct sk_buff *skb = NULL; while ((skb = skb_dequeue(&session->reorder_q))) { atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); } } /* UDP encapsulation receive handler. See net/ipv4/udp.c for details.
*/ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct l2tp_session *session = NULL; struct l2tp_tunnel *tunnel = NULL; struct net *net = sock_net(sk); unsigned char *ptr, *optr; u16 hdrflags; u16 version; int length; /* UDP has verified checksum */ /* UDP always verifies the packet length. */ __skb_pull(skb, sizeof(struct udphdr)); /* Short packet? */ if (!pskb_may_pull(skb, L2TP_HDR_SIZE_MAX)) goto pass; /* Point to L2TP header */ optr = skb->data; ptr = skb->data; /* Get L2TP header flags */ hdrflags = ntohs(*(__be16 *)ptr); /* Get protocol version */ version = hdrflags & L2TP_HDR_VER_MASK; /* Get length of L2TP packet */ length = skb->len; /* If type is control packet, it is handled by userspace. */ if (hdrflags & L2TP_HDRFLAG_T) goto pass; /* Skip flags */ ptr += 2; if (version == L2TP_HDR_VER_2) { u16 tunnel_id, session_id; /* If length is present, skip it */ if (hdrflags & L2TP_HDRFLAG_L) ptr += 2; /* Extract tunnel and session ID */ tunnel_id = ntohs(*(__be16 *)ptr); ptr += 2; session_id = ntohs(*(__be16 *)ptr); ptr += 2; session = l2tp_v2_session_get(net, tunnel_id, session_id); } else { u32 session_id; ptr += 2; /* skip reserved bits */ session_id = ntohl(*(__be32 *)ptr); ptr += 4; session = l2tp_v3_session_get(net, sk, session_id); } if (!session || !session->recv_skb) { if (session) l2tp_session_put(session); /* Not found? Pass to userspace to deal with */ goto pass; } tunnel = session->tunnel; /* Check protocol version */ if (version != tunnel->version) goto invalid; if (version == L2TP_HDR_VER_3 && l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) { l2tp_session_put(session); goto invalid; } l2tp_recv_common(session, skb, ptr, optr, hdrflags, length); l2tp_session_put(session); return 0; invalid: atomic_long_inc(&tunnel->stats.rx_invalid); pass: /* Put UDP header back */ __skb_push(skb, sizeof(struct udphdr)); return 1; } EXPORT_SYMBOL_GPL(l2tp_udp_encap_recv); /* UDP encapsulation receive error handler. See net/ipv4/udp.c for details. */ static void l2tp_udp_encap_err_recv(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { sk->sk_err = err; sk_error_report(sk); if (ip_hdr(skb)->version == IPVERSION) { if (inet_test_bit(RECVERR, sk)) return ip_icmp_error(sk, skb, err, port, info, payload); #if IS_ENABLED(CONFIG_IPV6) } else { if (inet6_test_bit(RECVERR6, sk)) return ipv6_icmp_error(sk, skb, err, port, info, payload); #endif } } /************************************************************************ * Transmit handling ***********************************************************************/ /* Build an L2TP header for the session into the buffer provided. */ static int l2tp_build_l2tpv2_header(struct l2tp_session *session, void *buf) { struct l2tp_tunnel *tunnel = session->tunnel; __be16 *bufp = buf; __be16 *optr = buf; u16 flags = L2TP_HDR_VER_2; u32 tunnel_id = tunnel->peer_tunnel_id; u32 session_id = session->peer_session_id; if (session->send_seq) flags |= L2TP_HDRFLAG_S; /* Setup L2TP header. */ *bufp++ = htons(flags); *bufp++ = htons(tunnel_id); *bufp++ = htons(session_id); if (session->send_seq) { *bufp++ = htons(session->ns); *bufp++ = 0; session->ns++; session->ns &= 0xffff; trace_session_seqnum_update(session); } return bufp - optr; } static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf) { struct l2tp_tunnel *tunnel = session->tunnel; char *bufp = buf; char *optr = bufp; /* Setup L2TP header. The header differs slightly for UDP and * IP encapsulations. 
For UDP, there is 4 bytes of flags. */ if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { u16 flags = L2TP_HDR_VER_3; *((__be16 *)bufp) = htons(flags); bufp += 2; *((__be16 *)bufp) = 0; bufp += 2; } *((__be32 *)bufp) = htonl(session->peer_session_id); bufp += 4; if (session->cookie_len) { memcpy(bufp, &session->cookie[0], session->cookie_len); bufp += session->cookie_len; } if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) { u32 l2h = 0; if (session->send_seq) { l2h = 0x40000000 | session->ns; session->ns++; session->ns &= 0xffffff; trace_session_seqnum_update(session); } *((__be32 *)bufp) = htonl(l2h); bufp += 4; } return bufp - optr; } /* Queue the packet to IP for output: tunnel socket lock must be held */ static int l2tp_xmit_queue(struct l2tp_tunnel *tunnel, struct sk_buff *skb, struct flowi *fl) { int err; skb->ignore_df = 1; skb_dst_drop(skb); #if IS_ENABLED(CONFIG_IPV6) if (l2tp_sk_is_v6(tunnel->sock)) err = inet6_csk_xmit(tunnel->sock, skb, NULL); else #endif err = ip_queue_xmit(tunnel->sock, skb, fl); return err >= 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; } static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, unsigned int *len) { struct l2tp_tunnel *tunnel = session->tunnel; unsigned int data_len = skb->len; struct sock *sk = tunnel->sock; int headroom, uhlen, udp_len; int ret = NET_XMIT_SUCCESS; struct inet_sock *inet; struct udphdr *uh; /* Check that there's enough headroom in the skb to insert IP, * UDP and L2TP headers. If not enough, expand it to * make room. Adjust truesize. */ uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(*uh) : 0; headroom = NET_SKB_PAD + sizeof(struct iphdr) + uhlen + session->hdr_len; if (skb_cow_head(skb, headroom)) { kfree_skb(skb); return NET_XMIT_DROP; } /* Setup L2TP header */ if (tunnel->version == L2TP_HDR_VER_2) l2tp_build_l2tpv2_header(session, __skb_push(skb, session->hdr_len)); else l2tp_build_l2tpv3_header(session, __skb_push(skb, session->hdr_len)); /* Reset skb netfilter state */ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); nf_reset_ct(skb); /* L2TP uses its own lockdep subclass to avoid lockdep splats caused by * nested socket calls on the same lockdep socket class. This can * happen when data from a user socket is routed over l2tp, which uses * another userspace socket. */ spin_lock_nested(&sk->sk_lock.slock, L2TP_DEPTH_NESTING); if (sock_owned_by_user(sk)) { kfree_skb(skb); ret = NET_XMIT_DROP; goto out_unlock; } /* The user-space may change the connection status for the user-space * provided socket at run time: we must check it under the socket lock */ if (tunnel->fd >= 0 && sk->sk_state != TCP_ESTABLISHED) { kfree_skb(skb); ret = NET_XMIT_DROP; goto out_unlock; } /* Report transmitted length before we add encap header, which keeps * statistics consistent for both UDP and IP encap tx/rx paths. 
*/ *len = skb->len; inet = inet_sk(sk); switch (tunnel->encap) { case L2TP_ENCAPTYPE_UDP: /* Setup UDP header */ __skb_push(skb, sizeof(*uh)); skb_reset_transport_header(skb); uh = udp_hdr(skb); uh->source = inet->inet_sport; uh->dest = inet->inet_dport; udp_len = uhlen + session->hdr_len + data_len; uh->len = htons(udp_len); /* Calculate UDP checksum if configured to do so */ #if IS_ENABLED(CONFIG_IPV6) if (l2tp_sk_is_v6(sk)) udp6_set_csum(udp_get_no_check6_tx(sk), skb, &inet6_sk(sk)->saddr, &sk->sk_v6_daddr, udp_len); else #endif udp_set_csum(sk->sk_no_check_tx, skb, inet->inet_saddr, inet->inet_daddr, udp_len); break; case L2TP_ENCAPTYPE_IP: break; } ret = l2tp_xmit_queue(tunnel, skb, &inet->cork.fl); out_unlock: spin_unlock(&sk->sk_lock.slock); return ret; } /* If caller requires the skb to have a ppp header, the header must be * inserted in the skb data before calling this function. */ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb) { unsigned int len = 0; int ret; ret = l2tp_xmit_core(session, skb, &len); if (ret == NET_XMIT_SUCCESS) { atomic_long_inc(&session->tunnel->stats.tx_packets); atomic_long_add(len, &session->tunnel->stats.tx_bytes); atomic_long_inc(&session->stats.tx_packets); atomic_long_add(len, &session->stats.tx_bytes); } else { atomic_long_inc(&session->tunnel->stats.tx_errors); atomic_long_inc(&session->stats.tx_errors); } return ret; } EXPORT_SYMBOL_GPL(l2tp_xmit_skb); /***************************************************************************** * Tunnel and session create/destroy. *****************************************************************************/ /* Remove an l2tp session from l2tp_core's lists. */ static void l2tp_session_unhash(struct l2tp_session *session) { struct l2tp_tunnel *tunnel = session->tunnel; if (tunnel) { struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); struct l2tp_session *removed = session; spin_lock_bh(&tunnel->list_lock); spin_lock_bh(&pn->l2tp_session_idr_lock); /* Remove from the per-tunnel list */ list_del_init(&session->list); /* Remove from per-net IDR */ if (tunnel->version == L2TP_HDR_VER_3) { if (hash_hashed(&session->hlist)) l2tp_session_collision_del(pn, session); else removed = idr_remove(&pn->l2tp_v3_session_idr, session->session_id); } else { u32 session_key = l2tp_v2_session_key(tunnel->tunnel_id, session->session_id); removed = idr_remove(&pn->l2tp_v2_session_idr, session_key); } WARN_ON_ONCE(removed && removed != session); spin_unlock_bh(&pn->l2tp_session_idr_lock); spin_unlock_bh(&tunnel->list_lock); } } /* When the tunnel is closed, all the attached sessions need to go too.
*/ static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) { struct l2tp_session *session; spin_lock_bh(&tunnel->list_lock); tunnel->acpt_newsess = false; list_for_each_entry(session, &tunnel->session_list, list) l2tp_session_delete(session); spin_unlock_bh(&tunnel->list_lock); } /* Tunnel socket destroy hook for UDP encapsulation */ static void l2tp_udp_encap_destroy(struct sock *sk) { struct l2tp_tunnel *tunnel; tunnel = l2tp_sk_to_tunnel(sk); if (tunnel) { l2tp_tunnel_delete(tunnel); l2tp_tunnel_put(tunnel); } } static void l2tp_tunnel_remove(struct net *net, struct l2tp_tunnel *tunnel) { struct l2tp_net *pn = l2tp_pernet(net); spin_lock_bh(&pn->l2tp_tunnel_idr_lock); idr_remove(&pn->l2tp_tunnel_idr, tunnel->tunnel_id); spin_unlock_bh(&pn->l2tp_tunnel_idr_lock); } /* Workqueue tunnel deletion function */ static void l2tp_tunnel_del_work(struct work_struct *work) { struct l2tp_tunnel *tunnel = container_of(work, struct l2tp_tunnel, del_work); struct sock *sk = tunnel->sock; struct socket *sock = sk->sk_socket; l2tp_tunnel_closeall(tunnel); /* If the tunnel socket was created within the kernel, use * the sk API to release it here. */ if (tunnel->fd < 0) { if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } } l2tp_tunnel_remove(tunnel->l2tp_net, tunnel); /* drop initial ref */ l2tp_tunnel_put(tunnel); /* drop workqueue ref */ l2tp_tunnel_put(tunnel); } /* Create a socket for the tunnel, if one isn't set up by * userspace. This is used for static tunnels where there is no * managing L2TP daemon. * * Since we don't want these sockets to keep a namespace alive by * themselves, we drop the socket's namespace refcount after creation. * These sockets are freed when the namespace exits using the pernet * exit hook. */ static int l2tp_tunnel_sock_create(struct net *net, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct socket **sockp) { int err = -EINVAL; struct socket *sock = NULL; struct udp_port_cfg udp_conf; switch (cfg->encap) { case L2TP_ENCAPTYPE_UDP: memset(&udp_conf, 0, sizeof(udp_conf)); #if IS_ENABLED(CONFIG_IPV6) if (cfg->local_ip6 && cfg->peer_ip6) { udp_conf.family = AF_INET6; memcpy(&udp_conf.local_ip6, cfg->local_ip6, sizeof(udp_conf.local_ip6)); memcpy(&udp_conf.peer_ip6, cfg->peer_ip6, sizeof(udp_conf.peer_ip6)); udp_conf.use_udp6_tx_checksums = !cfg->udp6_zero_tx_checksums; udp_conf.use_udp6_rx_checksums = !cfg->udp6_zero_rx_checksums; } else #endif { udp_conf.family = AF_INET; udp_conf.local_ip = cfg->local_ip; udp_conf.peer_ip = cfg->peer_ip; udp_conf.use_udp_checksums = cfg->use_udp_checksums; } udp_conf.local_udp_port = htons(cfg->local_udp_port); udp_conf.peer_udp_port = htons(cfg->peer_udp_port); err = udp_sock_create(net, &udp_conf, &sock); if (err < 0) goto out; break; case L2TP_ENCAPTYPE_IP: #if IS_ENABLED(CONFIG_IPV6) if (cfg->local_ip6 && cfg->peer_ip6) { struct sockaddr_l2tpip6 ip6_addr = {0}; err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; ip6_addr.l2tp_family = AF_INET6; memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6, sizeof(ip6_addr.l2tp_addr)); ip6_addr.l2tp_conn_id = tunnel_id; err = kernel_bind(sock, (struct sockaddr *)&ip6_addr, sizeof(ip6_addr)); if (err < 0) goto out; ip6_addr.l2tp_family = AF_INET6; memcpy(&ip6_addr.l2tp_addr, cfg->peer_ip6, sizeof(ip6_addr.l2tp_addr)); ip6_addr.l2tp_conn_id = peer_tunnel_id; err = kernel_connect(sock, (struct sockaddr *)&ip6_addr, sizeof(ip6_addr), 0); if (err < 0) goto out; } else #endif { struct sockaddr_l2tpip ip_addr = {0}; err = 
sock_create_kern(net, AF_INET, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->local_ip; ip_addr.l2tp_conn_id = tunnel_id; err = kernel_bind(sock, (struct sockaddr *)&ip_addr, sizeof(ip_addr)); if (err < 0) goto out; ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->peer_ip; ip_addr.l2tp_conn_id = peer_tunnel_id; err = kernel_connect(sock, (struct sockaddr *)&ip_addr, sizeof(ip_addr), 0); if (err < 0) goto out; } break; default: goto out; } out: *sockp = sock; if (err < 0 && sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); *sockp = NULL; } return err; } int l2tp_tunnel_create(int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp) { struct l2tp_tunnel *tunnel = NULL; int err; enum l2tp_encap_type encap = L2TP_ENCAPTYPE_UDP; if (cfg) encap = cfg->encap; tunnel = kzalloc(sizeof(*tunnel), GFP_KERNEL); if (!tunnel) { err = -ENOMEM; goto err; } tunnel->version = version; tunnel->tunnel_id = tunnel_id; tunnel->peer_tunnel_id = peer_tunnel_id; sprintf(&tunnel->name[0], "tunl %u", tunnel_id); spin_lock_init(&tunnel->list_lock); tunnel->acpt_newsess = true; INIT_LIST_HEAD(&tunnel->session_list); tunnel->encap = encap; refcount_set(&tunnel->ref_count, 1); tunnel->fd = fd; /* Init delete workqueue struct */ INIT_WORK(&tunnel->del_work, l2tp_tunnel_del_work); err = 0; err: if (tunnelp) *tunnelp = tunnel; return err; } EXPORT_SYMBOL_GPL(l2tp_tunnel_create); static int l2tp_validate_socket(const struct sock *sk, const struct net *net, enum l2tp_encap_type encap) { struct l2tp_tunnel *tunnel; if (!net_eq(sock_net(sk), net)) return -EINVAL; if (sk->sk_type != SOCK_DGRAM) return -EPROTONOSUPPORT; if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) return -EPROTONOSUPPORT; if ((encap == L2TP_ENCAPTYPE_UDP && sk->sk_protocol != IPPROTO_UDP) || (encap == L2TP_ENCAPTYPE_IP && sk->sk_protocol != IPPROTO_L2TP)) return -EPROTONOSUPPORT; if (encap == L2TP_ENCAPTYPE_UDP && sk->sk_user_data) return -EBUSY; tunnel = l2tp_sk_to_tunnel(sk); if (tunnel) { l2tp_tunnel_put(tunnel); return -EBUSY; } return 0; } int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, struct l2tp_tunnel_cfg *cfg) { struct l2tp_net *pn = l2tp_pernet(net); u32 tunnel_id = tunnel->tunnel_id; struct socket *sock; struct sock *sk; int ret; spin_lock_bh(&pn->l2tp_tunnel_idr_lock); ret = idr_alloc_u32(&pn->l2tp_tunnel_idr, NULL, &tunnel_id, tunnel_id, GFP_ATOMIC); spin_unlock_bh(&pn->l2tp_tunnel_idr_lock); if (ret) return ret == -ENOSPC ? 
-EEXIST : ret; if (tunnel->fd < 0) { ret = l2tp_tunnel_sock_create(net, tunnel->tunnel_id, tunnel->peer_tunnel_id, cfg, &sock); if (ret < 0) goto err; } else { sock = sockfd_lookup(tunnel->fd, &ret); if (!sock) goto err; } sk = sock->sk; lock_sock(sk); write_lock_bh(&sk->sk_callback_lock); ret = l2tp_validate_socket(sk, net, tunnel->encap); if (ret < 0) goto err_inval_sock; write_unlock_bh(&sk->sk_callback_lock); if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { struct udp_tunnel_sock_cfg udp_cfg = { .encap_type = UDP_ENCAP_L2TPINUDP, .encap_rcv = l2tp_udp_encap_recv, .encap_err_rcv = l2tp_udp_encap_err_recv, .encap_destroy = l2tp_udp_encap_destroy, }; setup_udp_tunnel_sock(net, sock, &udp_cfg); } sk->sk_allocation = GFP_ATOMIC; release_sock(sk); sock_hold(sk); tunnel->sock = sk; tunnel->l2tp_net = net; spin_lock_bh(&pn->l2tp_tunnel_idr_lock); idr_replace(&pn->l2tp_tunnel_idr, tunnel, tunnel->tunnel_id); spin_unlock_bh(&pn->l2tp_tunnel_idr_lock); trace_register_tunnel(tunnel); if (tunnel->fd >= 0) sockfd_put(sock); return 0; err_inval_sock: write_unlock_bh(&sk->sk_callback_lock); release_sock(sk); if (tunnel->fd < 0) sock_release(sock); else sockfd_put(sock); err: l2tp_tunnel_remove(net, tunnel); return ret; } EXPORT_SYMBOL_GPL(l2tp_tunnel_register); /* This function is used by the netlink TUNNEL_DELETE command. */ void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { if (!test_and_set_bit(0, &tunnel->dead)) { trace_delete_tunnel(tunnel); refcount_inc(&tunnel->ref_count); queue_work(l2tp_wq, &tunnel->del_work); } } EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); void l2tp_session_delete(struct l2tp_session *session) { if (!test_and_set_bit(0, &session->dead)) { trace_delete_session(session); refcount_inc(&session->ref_count); queue_work(l2tp_wq, &session->del_work); } } EXPORT_SYMBOL_GPL(l2tp_session_delete); /* Workqueue session deletion function */ static void l2tp_session_del_work(struct work_struct *work) { struct l2tp_session *session = container_of(work, struct l2tp_session, del_work); l2tp_session_unhash(session); l2tp_session_queue_purge(session); if (session->session_close) (*session->session_close)(session); /* drop initial ref */ l2tp_session_put(session); /* drop workqueue ref */ l2tp_session_put(session); } /* We come here whenever a session's send_seq, cookie_len or * l2specific_type parameters are set. 
*/ void l2tp_session_set_header_len(struct l2tp_session *session, int version, enum l2tp_encap_type encap) { if (version == L2TP_HDR_VER_2) { session->hdr_len = 6; if (session->send_seq) session->hdr_len += 4; } else { session->hdr_len = 4 + session->cookie_len; session->hdr_len += l2tp_get_l2specific_len(session); if (encap == L2TP_ENCAPTYPE_UDP) session->hdr_len += 4; } } EXPORT_SYMBOL_GPL(l2tp_session_set_header_len); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) { struct l2tp_session *session; session = kzalloc(sizeof(*session) + priv_size, GFP_KERNEL); if (session) { session->magic = L2TP_SESSION_MAGIC; session->session_id = session_id; session->peer_session_id = peer_session_id; session->nr = 0; if (tunnel->version == L2TP_HDR_VER_2) session->nr_max = 0xffff; else session->nr_max = 0xffffff; session->nr_window_size = session->nr_max / 2; session->nr_oos_count_max = 4; /* Use NR of first received packet */ session->reorder_skip = 1; sprintf(&session->name[0], "sess %u/%u", tunnel->tunnel_id, session->session_id); skb_queue_head_init(&session->reorder_q); session->hlist_key = l2tp_v3_session_hashkey(tunnel->sock, session->session_id); INIT_HLIST_NODE(&session->hlist); INIT_LIST_HEAD(&session->clist); INIT_LIST_HEAD(&session->list); INIT_WORK(&session->del_work, l2tp_session_del_work); if (cfg) { session->pwtype = cfg->pw_type; session->send_seq = cfg->send_seq; session->recv_seq = cfg->recv_seq; session->lns_mode = cfg->lns_mode; session->reorder_timeout = cfg->reorder_timeout; session->l2specific_type = cfg->l2specific_type; session->cookie_len = cfg->cookie_len; memcpy(&session->cookie[0], &cfg->cookie[0], cfg->cookie_len); session->peer_cookie_len = cfg->peer_cookie_len; memcpy(&session->peer_cookie[0], &cfg->peer_cookie[0], cfg->peer_cookie_len); } l2tp_session_set_header_len(session, tunnel->version, tunnel->encap); refcount_set(&session->ref_count, 1); return session; } return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL_GPL(l2tp_session_create); /***************************************************************************** * Init and cleanup *****************************************************************************/ static __net_init int l2tp_init_net(struct net *net) { struct l2tp_net *pn = net_generic(net, l2tp_net_id); idr_init(&pn->l2tp_tunnel_idr); spin_lock_init(&pn->l2tp_tunnel_idr_lock); idr_init(&pn->l2tp_v2_session_idr); idr_init(&pn->l2tp_v3_session_idr); spin_lock_init(&pn->l2tp_session_idr_lock); return 0; } static __net_exit void l2tp_pre_exit_net(struct net *net) { struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_tunnel *tunnel = NULL; unsigned long tunnel_id, tmp; rcu_read_lock_bh(); idr_for_each_entry_ul(&pn->l2tp_tunnel_idr, tunnel, tmp, tunnel_id) { if (tunnel) l2tp_tunnel_delete(tunnel); } rcu_read_unlock_bh(); if (l2tp_wq) { /* Run all TUNNEL_DELETE work items just queued. */ __flush_workqueue(l2tp_wq); /* Each TUNNEL_DELETE work item will queue a SESSION_DELETE * work item for each session in the tunnel. Flush the * workqueue again to process these. */ __flush_workqueue(l2tp_wq); } } static int l2tp_idr_item_unexpected(int id, void *p, void *data) { const char *idr_name = data; pr_err("l2tp: %s IDR not empty at net %d exit\n", idr_name, id); WARN_ON_ONCE(1); return 1; } static __net_exit void l2tp_exit_net(struct net *net) { struct l2tp_net *pn = l2tp_pernet(net); /* Our per-net IDRs should be empty. 
Check that is so, to * help catch cleanup races or refcnt leaks. */ idr_for_each(&pn->l2tp_v2_session_idr, l2tp_idr_item_unexpected, "v2_session"); idr_for_each(&pn->l2tp_v3_session_idr, l2tp_idr_item_unexpected, "v3_session"); idr_for_each(&pn->l2tp_tunnel_idr, l2tp_idr_item_unexpected, "tunnel"); idr_destroy(&pn->l2tp_v2_session_idr); idr_destroy(&pn->l2tp_v3_session_idr); idr_destroy(&pn->l2tp_tunnel_idr); } static struct pernet_operations l2tp_net_ops = { .init = l2tp_init_net, .exit = l2tp_exit_net, .pre_exit = l2tp_pre_exit_net, .id = &l2tp_net_id, .size = sizeof(struct l2tp_net), }; static int __init l2tp_init(void) { int rc = 0; rc = register_pernet_device(&l2tp_net_ops); if (rc) goto out; l2tp_wq = alloc_workqueue("l2tp", WQ_UNBOUND, 0); if (!l2tp_wq) { pr_err("alloc_workqueue failed\n"); unregister_pernet_device(&l2tp_net_ops); rc = -ENOMEM; goto out; } pr_info("L2TP core driver, %s\n", L2TP_DRV_VERSION); out: return rc; } static void __exit l2tp_exit(void) { unregister_pernet_device(&l2tp_net_ops); if (l2tp_wq) { destroy_workqueue(l2tp_wq); l2tp_wq = NULL; } } module_init(l2tp_init); module_exit(l2tp_exit); MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("L2TP core"); MODULE_LICENSE("GPL"); MODULE_VERSION(L2TP_DRV_VERSION);
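By way of illustration only, the data-header layout implied by l2tp_build_l2tpv2_header() and l2tp_session_set_header_len() above can be mirrored in a minimal, self-contained userspace sketch: 6 bytes of flags/version, tunnel id and session id, plus 4 bytes of Ns/Nr when sequencing is enabled. This is not kernel code; the flag and version constants follow RFC 2661, and the helper name and buffer handling are illustrative assumptions.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define HDRFLAG_S  0x0800   /* sequence numbers present (RFC 2661 S bit) */
#define HDR_VER_2  0x0002   /* protocol version 2 */

/* Pack an L2TPv2 data header into buf and return its length. */
static size_t build_v2_data_header(uint8_t *buf, uint16_t tunnel_id,
                                   uint16_t session_id, int send_seq,
                                   uint16_t ns)
{
        size_t off = 0;
        uint16_t v;

        v = htons(HDR_VER_2 | (send_seq ? HDRFLAG_S : 0));
        memcpy(buf + off, &v, 2); off += 2;
        v = htons(tunnel_id);
        memcpy(buf + off, &v, 2); off += 2;
        v = htons(session_id);
        memcpy(buf + off, &v, 2); off += 2;
        if (send_seq) {
                v = htons(ns);            /* Ns */
                memcpy(buf + off, &v, 2); off += 2;
                v = 0;                    /* Nr is unused on data packets */
                memcpy(buf + off, &v, 2); off += 2;
        }
        return off;   /* 6 bytes, or 10 with sequencing, matching hdr_len */
}

int main(void)
{
        uint8_t hdr[10];
        size_t len = build_v2_data_header(hdr, 1, 2, 1, 7);

        printf("header length: %zu bytes\n", len);   /* prints 10 */
        return 0;
}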
// SPDX-License-Identifier: GPL-2.0-only // Copyright (c) 2020, Nikolay Aleksandrov <nikolay@cumulusnetworks.com> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <net/ip_tunnels.h> #include "br_private.h" #include "br_private_tunnel.h" static bool __vlan_tun_put(struct sk_buff *skb, const struct net_bridge_vlan *v) { __be32 tid = tunnel_id_to_key32(v->tinfo.tunnel_id); struct nlattr *nest; if (!v->tinfo.tunnel_dst) return true; nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_TUNNEL_INFO); if (!nest) return false; if (nla_put_u32(skb, BRIDGE_VLANDB_TINFO_ID, be32_to_cpu(tid))) { nla_nest_cancel(skb,
nest); return false; } nla_nest_end(skb, nest); return true; } static bool __vlan_tun_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end) { return (!v_curr->tinfo.tunnel_dst && !range_end->tinfo.tunnel_dst) || vlan_tunid_inrange(v_curr, range_end); } /* check if the options' state of v_curr allow it to enter the range */ bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end) { u8 range_mc_rtr = br_vlan_multicast_router(range_end); u8 curr_mc_rtr = br_vlan_multicast_router(v_curr); return v_curr->state == range_end->state && __vlan_tun_can_enter_range(v_curr, range_end) && curr_mc_rtr == range_mc_rtr; } bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v, const struct net_bridge_port *p) { if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, br_vlan_get_state(v)) || !__vlan_tun_put(skb, v) || nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS, !!(v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED))) return false; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_MCAST_ROUTER, br_vlan_multicast_router(v))) return false; if (p && !br_multicast_port_ctx_vlan_disabled(&v->port_mcast_ctx) && (nla_put_u32(skb, BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS, br_multicast_ngroups_get(&v->port_mcast_ctx)) || nla_put_u32(skb, BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS, br_multicast_ngroups_get_max(&v->port_mcast_ctx)))) return false; #endif return true; } size_t br_vlan_opts_nl_size(void) { return nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_STATE */ + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY_TUNNEL_INFO */ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_TINFO_ID */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_MCAST_ROUTER */ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS */ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS */ #endif + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS */ + 0; } static int br_vlan_modify_state(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *v, u8 state, bool *changed, struct netlink_ext_ack *extack) { struct net_bridge *br; ASSERT_RTNL(); if (state > BR_STATE_BLOCKING) { NL_SET_ERR_MSG_MOD(extack, "Invalid vlan state"); return -EINVAL; } if (br_vlan_is_brentry(v)) br = v->br; else br = v->port->br; if (br->stp_enabled == BR_KERNEL_STP) { NL_SET_ERR_MSG_MOD(extack, "Can't modify vlan state when using kernel STP"); return -EBUSY; } if (br_opt_get(br, BROPT_MST_ENABLED)) { NL_SET_ERR_MSG_MOD(extack, "Can't modify vlan state directly when MST is enabled"); return -EBUSY; } if (v->state == state) return 0; if (v->vid == br_get_pvid(vg)) br_vlan_set_pvid_state(vg, state); br_vlan_set_state(v, state); *changed = true; return 0; } static const struct nla_policy br_vlandb_tinfo_pol[BRIDGE_VLANDB_TINFO_MAX + 1] = { [BRIDGE_VLANDB_TINFO_ID] = { .type = NLA_U32 }, [BRIDGE_VLANDB_TINFO_CMD] = { .type = NLA_U32 }, }; static int br_vlan_modify_tunnel(const struct net_bridge_port *p, struct net_bridge_vlan *v, struct nlattr **tb, bool *changed, struct netlink_ext_ack *extack) { struct nlattr *tun_tb[BRIDGE_VLANDB_TINFO_MAX + 1], *attr; struct bridge_vlan_info *vinfo; u32 tun_id = 0; int cmd, err; if (!p) { NL_SET_ERR_MSG_MOD(extack, "Can't modify tunnel mapping of non-port vlans"); return -EINVAL; } if (!(p->flags & BR_VLAN_TUNNEL)) { NL_SET_ERR_MSG_MOD(extack, "Port doesn't have tunnel flag set"); return -EINVAL; } attr = tb[BRIDGE_VLANDB_ENTRY_TUNNEL_INFO]; 
err = nla_parse_nested(tun_tb, BRIDGE_VLANDB_TINFO_MAX, attr, br_vlandb_tinfo_pol, extack); if (err) return err; if (!tun_tb[BRIDGE_VLANDB_TINFO_CMD]) { NL_SET_ERR_MSG_MOD(extack, "Missing tunnel command attribute"); return -ENOENT; } cmd = nla_get_u32(tun_tb[BRIDGE_VLANDB_TINFO_CMD]); switch (cmd) { case RTM_SETLINK: if (!tun_tb[BRIDGE_VLANDB_TINFO_ID]) { NL_SET_ERR_MSG_MOD(extack, "Missing tunnel id attribute"); return -ENOENT; } /* when working on vlan ranges this is the starting tunnel id */ tun_id = nla_get_u32(tun_tb[BRIDGE_VLANDB_TINFO_ID]); /* vlan info attr is guaranteed by br_vlan_rtm_process_one */ vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]); /* tunnel ids are mapped to each vlan in increasing order, * the starting vlan is in BRIDGE_VLANDB_ENTRY_INFO and v is the * current vlan, so we compute: tun_id + v - vinfo->vid */ tun_id += v->vid - vinfo->vid; break; case RTM_DELLINK: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported tunnel command"); return -EINVAL; } return br_vlan_tunnel_info(p, cmd, v->vid, tun_id, changed); } static int br_vlan_process_one_opts(const struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_vlan_group *vg, struct net_bridge_vlan *v, struct nlattr **tb, bool *changed, struct netlink_ext_ack *extack) { int err; *changed = false; if (tb[BRIDGE_VLANDB_ENTRY_STATE]) { u8 state = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_STATE]); err = br_vlan_modify_state(vg, v, state, changed, extack); if (err) return err; } if (tb[BRIDGE_VLANDB_ENTRY_TUNNEL_INFO]) { err = br_vlan_modify_tunnel(p, v, tb, changed, extack); if (err) return err; } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (tb[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]) { u8 val; val = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]); err = br_multicast_set_vlan_router(v, val); if (err) return err; *changed = true; } if (tb[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS]) { u32 val; if (!p) { NL_SET_ERR_MSG_MOD(extack, "Can't set mcast_max_groups for non-port vlans"); return -EINVAL; } if (br_multicast_port_ctx_vlan_disabled(&v->port_mcast_ctx)) { NL_SET_ERR_MSG_MOD(extack, "Multicast snooping disabled on this VLAN"); return -EINVAL; } val = nla_get_u32(tb[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS]); br_multicast_ngroups_set_max(&v->port_mcast_ctx, val); *changed = true; } #endif if (tb[BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS]) { bool enabled = v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED; bool val = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS]); if (!p) { NL_SET_ERR_MSG_MOD(extack, "Can't set neigh_suppress for non-port vlans"); return -EINVAL; } if (val != enabled) { v->priv_flags ^= BR_VLFLAG_NEIGH_SUPPRESS_ENABLED; *changed = true; } } return 0; } int br_vlan_process_options(const struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_vlan *range_start, struct net_bridge_vlan *range_end, struct nlattr **tb, struct netlink_ext_ack *extack) { struct net_bridge_vlan *v, *curr_start = NULL, *curr_end = NULL; struct net_bridge_vlan_group *vg; int vid, err = 0; u16 pvid; if (p) vg = nbp_vlan_group(p); else vg = br_vlan_group(br); if (!range_start || !br_vlan_should_use(range_start)) { NL_SET_ERR_MSG_MOD(extack, "Vlan range start doesn't exist, can't process options"); return -ENOENT; } if (!range_end || !br_vlan_should_use(range_end)) { NL_SET_ERR_MSG_MOD(extack, "Vlan range end doesn't exist, can't process options"); return -ENOENT; } pvid = br_get_pvid(vg); for (vid = range_start->vid; vid <= range_end->vid; vid++) { bool changed = false; v = br_vlan_find(vg, vid); if (!v || 
!br_vlan_should_use(v)) { NL_SET_ERR_MSG_MOD(extack, "Vlan in range doesn't exist, can't process options"); err = -ENOENT; break; } err = br_vlan_process_one_opts(br, p, vg, v, tb, &changed, extack); if (err) break; if (changed) { /* vlan options changed, check for range */ if (!curr_start) { curr_start = v; curr_end = v; continue; } if (v->vid == pvid || !br_vlan_can_enter_range(v, curr_end)) { br_vlan_notify(br, p, curr_start->vid, curr_end->vid, RTM_NEWVLAN); curr_start = v; } curr_end = v; } else { /* nothing changed and nothing to notify yet */ if (!curr_start) continue; br_vlan_notify(br, p, curr_start->vid, curr_end->vid, RTM_NEWVLAN); curr_start = NULL; curr_end = NULL; } } if (curr_start) br_vlan_notify(br, p, curr_start->vid, curr_end->vid, RTM_NEWVLAN); return err; } bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *r_end) { return v_curr->vid - r_end->vid == 1 && v_curr->msti == r_end->msti && ((v_curr->priv_flags ^ r_end->priv_flags) & BR_VLFLAG_GLOBAL_MCAST_ENABLED) == 0 && br_multicast_ctx_options_equal(&v_curr->br_mcast_ctx, &r_end->br_mcast_ctx); } bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, const struct net_bridge_vlan *v_opts) { struct nlattr *nest2 __maybe_unused; u64 clockval __maybe_unused; struct nlattr *nest; nest = nla_nest_start(skb, BRIDGE_VLANDB_GLOBAL_OPTIONS); if (!nest) return false; if (nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_ID, vid)) goto out_err; if (vid_range && vid < vid_range && nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_RANGE, vid_range)) goto out_err; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING, !!(v_opts->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) || nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION, v_opts->br_mcast_ctx.multicast_igmp_version) || nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT, v_opts->br_mcast_ctx.multicast_last_member_count) || nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT, v_opts->br_mcast_ctx.multicast_startup_query_count) || nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, v_opts->br_mcast_ctx.multicast_querier) || br_multicast_dump_querier_state(skb, &v_opts->br_mcast_ctx, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE)) goto out_err; clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_last_member_interval); if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL, clockval, BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_membership_interval); if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL, clockval, BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_querier_interval); if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL, clockval, BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_query_interval); if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL, clockval, BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_query_response_interval); if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL, clockval, BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_startup_query_interval); if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL, clockval, BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; if 
(br_rports_have_mc_router(&v_opts->br_mcast_ctx)) { nest2 = nla_nest_start(skb, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS); if (!nest2) goto out_err; rcu_read_lock(); if (br_rports_fill_info(skb, &v_opts->br_mcast_ctx)) { rcu_read_unlock(); nla_nest_cancel(skb, nest2); goto out_err; } rcu_read_unlock(); nla_nest_end(skb, nest2); } #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, v_opts->br_mcast_ctx.multicast_mld_version)) goto out_err; #endif #endif if (nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_MSTI, v_opts->msti)) goto out_err; nla_nest_end(skb, nest); return true; out_err: nla_nest_cancel(skb, nest); return false; } static size_t rtnl_vlan_global_opts_nlmsg_size(const struct net_bridge_vlan *v) { return NLMSG_ALIGN(sizeof(struct br_vlan_msg)) + nla_total_size(0) /* BRIDGE_VLANDB_GLOBAL_OPTIONS */ + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_GOPTS_ID */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING */ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION */ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION */ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT */ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL */ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER */ + br_multicast_querier_state_size() /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE */ + nla_total_size(0) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */ + br_rports_size(&v->br_mcast_ctx) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */ #endif + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_GOPTS_MSTI */ + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ } static void br_vlan_global_opts_notify(const struct net_bridge *br, u16 vid, u16 vid_range) { struct net_bridge_vlan *v; struct br_vlan_msg *bvm; struct nlmsghdr *nlh; struct sk_buff *skb; int err = -ENOBUFS; /* right now notifications are done only with rtnl held */ ASSERT_RTNL(); /* need to find the vlan due to flags/options */ v = br_vlan_find(br_vlan_group(br), vid); if (!v) return; skb = nlmsg_new(rtnl_vlan_global_opts_nlmsg_size(v), GFP_KERNEL); if (!skb) goto out_err; err = -EMSGSIZE; nlh = nlmsg_put(skb, 0, 0, RTM_NEWVLAN, sizeof(*bvm), 0); if (!nlh) goto out_err; bvm = nlmsg_data(nlh); memset(bvm, 0, sizeof(*bvm)); bvm->family = AF_BRIDGE; bvm->ifindex = br->dev->ifindex; if (!br_vlan_global_opts_fill(skb, vid, vid_range, v)) goto out_err; nlmsg_end(skb, nlh); rtnl_notify(skb, dev_net(br->dev), 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL); return; out_err: rtnl_set_sk_err(dev_net(br->dev), RTNLGRP_BRVLAN, err); kfree_skb(skb); } static int br_vlan_process_global_one_opts(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct net_bridge_vlan *v, struct nlattr **tb, bool *changed, struct netlink_ext_ack *extack) { int err __maybe_unused; *changed = false; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]) { u8 mc_snooping; mc_snooping = 
nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]); if (br_multicast_toggle_global_vlan(v, !!mc_snooping)) *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]) { u8 ver; ver = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]); err = br_multicast_set_igmp_version(&v->br_mcast_ctx, ver); if (err) return err; *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]) { u32 cnt; cnt = nla_get_u32(tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]); v->br_mcast_ctx.multicast_last_member_count = cnt; *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]) { u32 cnt; cnt = nla_get_u32(tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]); v->br_mcast_ctx.multicast_startup_query_count = cnt; *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]) { u64 val; val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]); v->br_mcast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]) { u64 val; val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]); v->br_mcast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]) { u64 val; val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]); v->br_mcast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]) { u64 val; val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]); br_multicast_set_query_intvl(&v->br_mcast_ctx, val); *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL]) { u64 val; val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL]); v->br_mcast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL]) { u64 val; val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL]); br_multicast_set_startup_query_intvl(&v->br_mcast_ctx, val); *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]) { u8 val; val = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]); err = br_multicast_set_querier(&v->br_mcast_ctx, val); if (err) return err; *changed = true; } #if IS_ENABLED(CONFIG_IPV6) if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) { u8 ver; ver = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]); err = br_multicast_set_mld_version(&v->br_mcast_ctx, ver); if (err) return err; *changed = true; } #endif #endif if (tb[BRIDGE_VLANDB_GOPTS_MSTI]) { u16 msti; msti = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_MSTI]); err = br_mst_vlan_set_msti(v, msti); if (err) return err; *changed = true; } return 0; } static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { [BRIDGE_VLANDB_GOPTS_ID] = { .type = NLA_U16 }, [BRIDGE_VLANDB_GOPTS_RANGE] = { .type = NLA_U16 }, [BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL] = { .type = NLA_U64 }, 
[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MSTI] = NLA_POLICY_MAX(NLA_U16, VLAN_N_VID - 1), }; int br_vlan_rtm_process_global_options(struct net_device *dev, const struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { struct net_bridge_vlan *v, *curr_start = NULL, *curr_end = NULL; struct nlattr *tb[BRIDGE_VLANDB_GOPTS_MAX + 1]; struct net_bridge_vlan_group *vg; u16 vid, vid_range = 0; struct net_bridge *br; int err = 0; if (cmd != RTM_NEWVLAN) { NL_SET_ERR_MSG_MOD(extack, "Global vlan options support only set operation"); return -EINVAL; } if (!netif_is_bridge_master(dev)) { NL_SET_ERR_MSG_MOD(extack, "Global vlan options can only be set on bridge device"); return -EINVAL; } br = netdev_priv(dev); vg = br_vlan_group(br); if (WARN_ON(!vg)) return -ENODEV; err = nla_parse_nested(tb, BRIDGE_VLANDB_GOPTS_MAX, attr, br_vlan_db_gpol, extack); if (err) return err; if (!tb[BRIDGE_VLANDB_GOPTS_ID]) { NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry id"); return -EINVAL; } vid = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_ID]); if (!br_vlan_valid_id(vid, extack)) return -EINVAL; if (tb[BRIDGE_VLANDB_GOPTS_RANGE]) { vid_range = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_RANGE]); if (!br_vlan_valid_id(vid_range, extack)) return -EINVAL; if (vid >= vid_range) { NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id"); return -EINVAL; } } else { vid_range = vid; } for (; vid <= vid_range; vid++) { bool changed = false; v = br_vlan_find(vg, vid); if (!v) { NL_SET_ERR_MSG_MOD(extack, "Vlan in range doesn't exist, can't process global options"); err = -ENOENT; break; } err = br_vlan_process_global_one_opts(br, vg, v, tb, &changed, extack); if (err) break; if (changed) { /* vlan options changed, check for range */ if (!curr_start) { curr_start = v; curr_end = v; continue; } if (!br_vlan_global_opts_can_enter_range(v, curr_end)) { br_vlan_global_opts_notify(br, curr_start->vid, curr_end->vid); curr_start = v; } curr_end = v; } else { /* nothing changed and nothing to notify yet */ if (!curr_start) continue; br_vlan_global_opts_notify(br, curr_start->vid, curr_end->vid); curr_start = NULL; curr_end = NULL; } } if (curr_start) br_vlan_global_opts_notify(br, curr_start->vid, curr_end->vid); return err; }
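As the comment in br_vlan_modify_tunnel() notes, when a tunnel id is set on a VLAN range the ids are mapped to the VLANs in increasing order: each VLAN receives the starting tunnel id plus its offset from the range start (tun_id + v->vid - vinfo->vid). The following standalone sketch of that arithmetic uses hypothetical names and assumes a contiguous VLAN range; it is an illustration, not bridge code.

#include <stdio.h>

/* Tunnel id assigned to one VLAN inside a range mapping. */
static unsigned int vlan_range_tunid(unsigned int start_tun_id,
                                     unsigned int range_start_vid,
                                     unsigned int vid)
{
        /* mirrors: tun_id += v->vid - vinfo->vid */
        return start_tun_id + (vid - range_start_vid);
}

int main(void)
{
        /* e.g. vlans 100-102 mapped starting at tunnel id 5000 -> 5000, 5001, 5002 */
        for (unsigned int vid = 100; vid <= 102; vid++)
                printf("vlan %u -> tunnel id %u\n", vid,
                       vlan_range_tunid(5000, 100, vid));
        return 0;
}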
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* af_can.c - Protocol family CAN core module * (used by different CAN protocol modules) * * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE.
* */ #include <linux/module.h> #include <linux/stddef.h> #include <linux/init.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/uaccess.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_ether.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/can-ml.h> #include <linux/ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> #include "af_can.h" MODULE_DESCRIPTION("Controller Area Network PF_CAN core"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>, " "Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); MODULE_ALIAS_NETPROTO(PF_CAN); static int stats_timer __read_mostly = 1; module_param(stats_timer, int, 0444); MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)"); static struct kmem_cache *rcv_cache __read_mostly; /* table of registered CAN protocols */ static const struct can_proto __rcu *proto_tab[CAN_NPROTO] __read_mostly; static DEFINE_MUTEX(proto_tab_lock); static atomic_t skbcounter = ATOMIC_INIT(0); /* af_can socket functions */ void can_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_error_queue); } EXPORT_SYMBOL(can_sock_destruct); static const struct can_proto *can_get_proto(int protocol) { const struct can_proto *cp; rcu_read_lock(); cp = rcu_dereference(proto_tab[protocol]); if (cp && !try_module_get(cp->prot->owner)) cp = NULL; rcu_read_unlock(); return cp; } static inline void can_put_proto(const struct can_proto *cp) { module_put(cp->prot->owner); } static int can_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; const struct can_proto *cp; int err = 0; sock->state = SS_UNCONNECTED; if (protocol < 0 || protocol >= CAN_NPROTO) return -EINVAL; cp = can_get_proto(protocol); #ifdef CONFIG_MODULES if (!cp) { /* try to load protocol module if kernel is modular */ err = request_module("can-proto-%d", protocol); /* In case of error we only print a message but don't * return the error code immediately. Below we will * return -EPROTONOSUPPORT */ if (err) pr_err_ratelimited("can: request_module (can-proto-%d) failed.\n", protocol); cp = can_get_proto(protocol); } #endif /* check for available protocol and correct usage */ if (!cp) return -EPROTONOSUPPORT; if (cp->type != sock->type) { err = -EPROTOTYPE; goto errout; } sock->ops = cp->ops; sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot, kern); if (!sk) { err = -ENOMEM; goto errout; } sock_init_data(sock, sk); sk->sk_destruct = can_sock_destruct; if (sk->sk_prot->init) err = sk->sk_prot->init(sk); if (err) { /* release sk on errors */ sock_orphan(sk); sock_put(sk); sock->sk = NULL; } else { sock_prot_inuse_add(net, sk->sk_prot, 1); } errout: can_put_proto(cp); return err; } /* af_can tx path */ /** * can_send - transmit a CAN frame (optional with local loopback) * @skb: pointer to socket buffer with CAN frame in data section * @loop: loopback for listeners on local CAN sockets (recommended default!) * * Due to the loopback this routine must not be called from hardirq context. 
* * Return: * 0 on success * -ENETDOWN when the selected interface is down * -ENOBUFS on full driver queue (see net_xmit_errno()) * -ENOMEM when local loopback failed at calling skb_clone() * -EPERM when trying to send on a non-CAN interface * -EMSGSIZE CAN frame size is bigger than CAN interface MTU * -EINVAL when the skb->data does not contain a valid CAN frame */ int can_send(struct sk_buff *skb, int loop) { struct sk_buff *newskb = NULL; struct can_pkg_stats *pkg_stats = dev_net(skb->dev)->can.pkg_stats; int err = -EINVAL; if (can_is_canxl_skb(skb)) { skb->protocol = htons(ETH_P_CANXL); } else if (can_is_can_skb(skb)) { skb->protocol = htons(ETH_P_CAN); } else if (can_is_canfd_skb(skb)) { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; skb->protocol = htons(ETH_P_CANFD); /* set CAN FD flag for CAN FD frames by default */ cfd->flags |= CANFD_FDF; } else { goto inval_skb; } /* Make sure the CAN frame can pass the selected CAN netdevice. */ if (unlikely(skb->len > READ_ONCE(skb->dev->mtu))) { err = -EMSGSIZE; goto inval_skb; } if (unlikely(skb->dev->type != ARPHRD_CAN)) { err = -EPERM; goto inval_skb; } if (unlikely(!(skb->dev->flags & IFF_UP))) { err = -ENETDOWN; goto inval_skb; } skb->ip_summed = CHECKSUM_UNNECESSARY; skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); if (loop) { /* local loopback of sent CAN frames */ /* indication for the CAN driver: do loopback */ skb->pkt_type = PACKET_LOOPBACK; /* The reference to the originating sock may be required * by the receiving socket to check whether the frame is * its own. Example: can_raw sockopt CAN_RAW_RECV_OWN_MSGS * Therefore we have to ensure that skb->sk remains the * reference to the originating sock by restoring skb->sk * after each skb_clone() or skb_orphan() usage. */ if (!(skb->dev->flags & IFF_ECHO)) { /* If the interface is not capable to do loopback * itself, we do it here. */ newskb = skb_clone(skb, GFP_ATOMIC); if (!newskb) { kfree_skb(skb); return -ENOMEM; } can_skb_set_owner(newskb, skb->sk); newskb->ip_summed = CHECKSUM_UNNECESSARY; newskb->pkt_type = PACKET_BROADCAST; } } else { /* indication for the CAN driver: no loopback required */ skb->pkt_type = PACKET_HOST; } /* send to netdevice */ err = dev_queue_xmit(skb); if (err > 0) err = net_xmit_errno(err); if (err) { kfree_skb(newskb); return err; } if (newskb) netif_rx(newskb); /* update statistics */ atomic_long_inc(&pkg_stats->tx_frames); atomic_long_inc(&pkg_stats->tx_frames_delta); return 0; inval_skb: kfree_skb(skb); return err; } EXPORT_SYMBOL(can_send); /* af_can rx path */ static struct can_dev_rcv_lists *can_dev_rcv_lists_find(struct net *net, struct net_device *dev) { if (dev) { struct can_ml_priv *can_ml = can_get_ml_priv(dev); return &can_ml->dev_rcv_lists; } else { return net->can.rx_alldev_list; } } /** * effhash - hash function for 29 bit CAN identifier reduction * @can_id: 29 bit CAN identifier * * Description: * To reduce the linear traversal in one linked list of _single_ EFF CAN * frame subscriptions the 29 bit identifier is mapped to 10 bits. 
* (see CAN_EFF_RCV_HASH_BITS definition) * * Return: * Hash value from 0x000 - 0x3FF ( enforced by CAN_EFF_RCV_HASH_BITS mask ) */ static unsigned int effhash(canid_t can_id) { unsigned int hash; hash = can_id; hash ^= can_id >> CAN_EFF_RCV_HASH_BITS; hash ^= can_id >> (2 * CAN_EFF_RCV_HASH_BITS); return hash & ((1 << CAN_EFF_RCV_HASH_BITS) - 1); } /** * can_rcv_list_find - determine optimal filterlist inside device filter struct * @can_id: pointer to CAN identifier of a given can_filter * @mask: pointer to CAN mask of a given can_filter * @dev_rcv_lists: pointer to the device filter struct * * Description: * Returns the optimal filterlist to reduce the filter handling in the * receive path. This function is called by service functions that need * to register or unregister a can_filter in the filter lists. * * A filter matches in general, when * * <received_can_id> & mask == can_id & mask * * so every bit set in the mask (even CAN_EFF_FLAG, CAN_RTR_FLAG) describe * relevant bits for the filter. * * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can * filter for error messages (CAN_ERR_FLAG bit set in mask). For error msg * frames there is a special filterlist and a special rx path filter handling. * * Return: * Pointer to optimal filterlist for the given can_id/mask pair. * Consistency checked mask. * Reduced can_id to have a preprocessed filter compare value. */ static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask, struct can_dev_rcv_lists *dev_rcv_lists) { canid_t inv = *can_id & CAN_INV_FILTER; /* save flag before masking */ /* filter for error message frames in extra filterlist */ if (*mask & CAN_ERR_FLAG) { /* clear CAN_ERR_FLAG in filter entry */ *mask &= CAN_ERR_MASK; return &dev_rcv_lists->rx[RX_ERR]; } /* with cleared CAN_ERR_FLAG we have a simple mask/value filterpair */ #define CAN_EFF_RTR_FLAGS (CAN_EFF_FLAG | CAN_RTR_FLAG) /* ensure valid values in can_mask for 'SFF only' frame filtering */ if ((*mask & CAN_EFF_FLAG) && !(*can_id & CAN_EFF_FLAG)) *mask &= (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS); /* reduce condition testing at receive time */ *can_id &= *mask; /* inverse can_id/can_mask filter */ if (inv) return &dev_rcv_lists->rx[RX_INV]; /* mask == 0 => no condition testing at receive time */ if (!(*mask)) return &dev_rcv_lists->rx[RX_ALL]; /* extra filterlists for the subscription of a single non-RTR can_id */ if (((*mask & CAN_EFF_RTR_FLAGS) == CAN_EFF_RTR_FLAGS) && !(*can_id & CAN_RTR_FLAG)) { if (*can_id & CAN_EFF_FLAG) { if (*mask == (CAN_EFF_MASK | CAN_EFF_RTR_FLAGS)) return &dev_rcv_lists->rx_eff[effhash(*can_id)]; } else { if (*mask == (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS)) return &dev_rcv_lists->rx_sff[*can_id]; } } /* default: filter via can_id/can_mask */ return &dev_rcv_lists->rx[RX_FIL]; } /** * can_rx_register - subscribe CAN frames from a specific interface * @net: the applicable net namespace * @dev: pointer to netdevice (NULL => subscribe from 'all' CAN devices list) * @can_id: CAN identifier (see description) * @mask: CAN mask (see description) * @func: callback function on filter match * @data: returned parameter for callback function * @ident: string for calling module identification * @sk: socket pointer (might be NULL) * * Description: * Invokes the callback function with the received sk_buff and the given * parameter 'data' on a matching receive filter. 
A filter matches, when * * <received_can_id> & mask == can_id & mask * * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can * filter for error message frames (CAN_ERR_FLAG bit set in mask). * * The provided pointer to the sk_buff is guaranteed to be valid as long as * the callback function is running. The callback function must *not* free * the given sk_buff while processing it's task. When the given sk_buff is * needed after the end of the callback function it must be cloned inside * the callback function with skb_clone(). * * Return: * 0 on success * -ENOMEM on missing cache mem to create subscription entry * -ENODEV unknown device */ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data, char *ident, struct sock *sk) { struct receiver *rcv; struct hlist_head *rcv_list; struct can_dev_rcv_lists *dev_rcv_lists; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; /* insert new receiver (dev,canid,mask) -> (func,data) */ if (dev && (dev->type != ARPHRD_CAN || !can_get_ml_priv(dev))) return -ENODEV; if (dev && !net_eq(net, dev_net(dev))) return -ENODEV; rcv = kmem_cache_alloc(rcv_cache, GFP_KERNEL); if (!rcv) return -ENOMEM; spin_lock_bh(&net->can.rcvlists_lock); dev_rcv_lists = can_dev_rcv_lists_find(net, dev); rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists); rcv->can_id = can_id; rcv->mask = mask; rcv->matches = 0; rcv->func = func; rcv->data = data; rcv->ident = ident; rcv->sk = sk; hlist_add_head_rcu(&rcv->list, rcv_list); dev_rcv_lists->entries++; rcv_lists_stats->rcv_entries++; rcv_lists_stats->rcv_entries_max = max(rcv_lists_stats->rcv_entries_max, rcv_lists_stats->rcv_entries); spin_unlock_bh(&net->can.rcvlists_lock); return 0; } EXPORT_SYMBOL(can_rx_register); /* can_rx_delete_receiver - rcu callback for single receiver entry removal */ static void can_rx_delete_receiver(struct rcu_head *rp) { struct receiver *rcv = container_of(rp, struct receiver, rcu); struct sock *sk = rcv->sk; kmem_cache_free(rcv_cache, rcv); if (sk) sock_put(sk); } /** * can_rx_unregister - unsubscribe CAN frames from a specific interface * @net: the applicable net namespace * @dev: pointer to netdevice (NULL => unsubscribe from 'all' CAN devices list) * @can_id: CAN identifier * @mask: CAN mask * @func: callback function on filter match * @data: returned parameter for callback function * * Description: * Removes subscription entry depending on given (subscription) values. */ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data) { struct receiver *rcv = NULL; struct hlist_head *rcv_list; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; struct can_dev_rcv_lists *dev_rcv_lists; if (dev && dev->type != ARPHRD_CAN) return; if (dev && !net_eq(net, dev_net(dev))) return; spin_lock_bh(&net->can.rcvlists_lock); dev_rcv_lists = can_dev_rcv_lists_find(net, dev); rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists); /* Search the receiver list for the item to delete. This should * exist, since no receiver may be unregistered that hasn't * been registered before. */ hlist_for_each_entry_rcu(rcv, rcv_list, list) { if (rcv->can_id == can_id && rcv->mask == mask && rcv->func == func && rcv->data == data) break; } /* Check for bugs in CAN protocol implementations using af_can.c: * 'rcv' will be NULL if no matching list item was found for removal. 
* As this case may potentially happen when closing a socket while * the notifier for removing the CAN netdev is running we just print * a warning here. */ if (!rcv) { pr_warn("can: receive list entry not found for dev %s, id %03X, mask %03X\n", DNAME(dev), can_id, mask); goto out; } hlist_del_rcu(&rcv->list); dev_rcv_lists->entries--; if (rcv_lists_stats->rcv_entries > 0) rcv_lists_stats->rcv_entries--; out: spin_unlock_bh(&net->can.rcvlists_lock); /* schedule the receiver item for deletion */ if (rcv) { if (rcv->sk) sock_hold(rcv->sk); call_rcu(&rcv->rcu, can_rx_delete_receiver); } } EXPORT_SYMBOL(can_rx_unregister); static inline void deliver(struct sk_buff *skb, struct receiver *rcv) { rcv->func(skb, rcv->data); rcv->matches++; } static int can_rcv_filter(struct can_dev_rcv_lists *dev_rcv_lists, struct sk_buff *skb) { struct receiver *rcv; int matches = 0; struct can_frame *cf = (struct can_frame *)skb->data; canid_t can_id = cf->can_id; if (dev_rcv_lists->entries == 0) return 0; if (can_id & CAN_ERR_FLAG) { /* check for error message frame entries only */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ERR], list) { if (can_id & rcv->mask) { deliver(skb, rcv); matches++; } } return matches; } /* check for unfiltered entries */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ALL], list) { deliver(skb, rcv); matches++; } /* check for can_id/mask entries */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_FIL], list) { if ((can_id & rcv->mask) == rcv->can_id) { deliver(skb, rcv); matches++; } } /* check for inverted can_id/mask entries */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_INV], list) { if ((can_id & rcv->mask) != rcv->can_id) { deliver(skb, rcv); matches++; } } /* check filterlists for single non-RTR can_ids */ if (can_id & CAN_RTR_FLAG) return matches; if (can_id & CAN_EFF_FLAG) { hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_eff[effhash(can_id)], list) { if (rcv->can_id == can_id) { deliver(skb, rcv); matches++; } } } else { can_id &= CAN_SFF_MASK; hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_sff[can_id], list) { deliver(skb, rcv); matches++; } } return matches; } static void can_receive(struct sk_buff *skb, struct net_device *dev) { struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = dev_net(dev); struct can_pkg_stats *pkg_stats = net->can.pkg_stats; int matches; /* update statistics */ atomic_long_inc(&pkg_stats->rx_frames); atomic_long_inc(&pkg_stats->rx_frames_delta); /* create non-zero unique skb identifier together with *skb */ while (!(can_skb_prv(skb)->skbcnt)) can_skb_prv(skb)->skbcnt = atomic_inc_return(&skbcounter); rcu_read_lock(); /* deliver the packet to sockets listening on all devices */ matches = can_rcv_filter(net->can.rx_alldev_list, skb); /* find receive list for this device */ dev_rcv_lists = can_dev_rcv_lists_find(net, dev); matches += can_rcv_filter(dev_rcv_lists, skb); rcu_read_unlock(); /* consume the skbuff allocated by the netdevice driver */ consume_skb(skb); if (matches > 0) { atomic_long_inc(&pkg_stats->matches); atomic_long_inc(&pkg_stats->matches_delta); } } static int can_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_can_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n", dev->type, skb->len); kfree_skb_reason(skb, SKB_DROP_REASON_CAN_RX_INVALID_FRAME); return NET_RX_DROP; } can_receive(skb, dev); return NET_RX_SUCCESS; } static int 
canfd_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canfd_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n", dev->type, skb->len); kfree_skb_reason(skb, SKB_DROP_REASON_CANFD_RX_INVALID_FRAME); return NET_RX_DROP; } can_receive(skb, dev); return NET_RX_SUCCESS; } static int canxl_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canxl_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN XL skbuff: dev type %d, len %d\n", dev->type, skb->len); kfree_skb_reason(skb, SKB_DROP_REASON_CANXL_RX_INVALID_FRAME); return NET_RX_DROP; } can_receive(skb, dev); return NET_RX_SUCCESS; } /* af_can protocol functions */ /** * can_proto_register - register CAN transport protocol * @cp: pointer to CAN protocol structure * * Return: * 0 on success * -EINVAL invalid (out of range) protocol number * -EBUSY protocol already in use * -ENOBUF if proto_register() fails */ int can_proto_register(const struct can_proto *cp) { int proto = cp->protocol; int err = 0; if (proto < 0 || proto >= CAN_NPROTO) { pr_err("can: protocol number %d out of range\n", proto); return -EINVAL; } err = proto_register(cp->prot, 0); if (err < 0) return err; mutex_lock(&proto_tab_lock); if (rcu_access_pointer(proto_tab[proto])) { pr_err("can: protocol %d already registered\n", proto); err = -EBUSY; } else { RCU_INIT_POINTER(proto_tab[proto], cp); } mutex_unlock(&proto_tab_lock); if (err < 0) proto_unregister(cp->prot); return err; } EXPORT_SYMBOL(can_proto_register); /** * can_proto_unregister - unregister CAN transport protocol * @cp: pointer to CAN protocol structure */ void can_proto_unregister(const struct can_proto *cp) { int proto = cp->protocol; mutex_lock(&proto_tab_lock); BUG_ON(rcu_access_pointer(proto_tab[proto]) != cp); RCU_INIT_POINTER(proto_tab[proto], NULL); mutex_unlock(&proto_tab_lock); synchronize_rcu(); proto_unregister(cp->prot); } EXPORT_SYMBOL(can_proto_unregister); static int can_pernet_init(struct net *net) { spin_lock_init(&net->can.rcvlists_lock); net->can.rx_alldev_list = kzalloc(sizeof(*net->can.rx_alldev_list), GFP_KERNEL); if (!net->can.rx_alldev_list) goto out; net->can.pkg_stats = kzalloc(sizeof(*net->can.pkg_stats), GFP_KERNEL); if (!net->can.pkg_stats) goto out_free_rx_alldev_list; net->can.rcv_lists_stats = kzalloc(sizeof(*net->can.rcv_lists_stats), GFP_KERNEL); if (!net->can.rcv_lists_stats) goto out_free_pkg_stats; if (IS_ENABLED(CONFIG_PROC_FS)) { /* the statistics are updated every second (timer triggered) */ if (stats_timer) { timer_setup(&net->can.stattimer, can_stat_update, 0); mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ)); } net->can.pkg_stats->jiffies_init = jiffies; can_init_proc(net); } return 0; out_free_pkg_stats: kfree(net->can.pkg_stats); out_free_rx_alldev_list: kfree(net->can.rx_alldev_list); out: return -ENOMEM; } static void can_pernet_exit(struct net *net) { if (IS_ENABLED(CONFIG_PROC_FS)) { can_remove_proc(net); if (stats_timer) timer_delete_sync(&net->can.stattimer); } kfree(net->can.rx_alldev_list); kfree(net->can.pkg_stats); kfree(net->can.rcv_lists_stats); } /* af_can module init/exit functions */ static struct packet_type can_packet __read_mostly = { .type = cpu_to_be16(ETH_P_CAN), .func = can_rcv, }; static struct packet_type canfd_packet __read_mostly = { .type 
= cpu_to_be16(ETH_P_CANFD), .func = canfd_rcv, }; static struct packet_type canxl_packet __read_mostly = { .type = cpu_to_be16(ETH_P_CANXL), .func = canxl_rcv, }; static const struct net_proto_family can_family_ops = { .family = PF_CAN, .create = can_create, .owner = THIS_MODULE, }; static struct pernet_operations can_pernet_ops __read_mostly = { .init = can_pernet_init, .exit = can_pernet_exit, }; static __init int can_init(void) { int err; /* check for correct padding to be able to use the structs similarly */ BUILD_BUG_ON(offsetof(struct can_frame, len) != offsetof(struct canfd_frame, len) || offsetof(struct can_frame, len) != offsetof(struct canxl_frame, flags) || offsetof(struct can_frame, data) != offsetof(struct canfd_frame, data)); pr_info("can: controller area network core\n"); rcv_cache = kmem_cache_create("can_receiver", sizeof(struct receiver), 0, 0, NULL); if (!rcv_cache) return -ENOMEM; err = register_pernet_subsys(&can_pernet_ops); if (err) goto out_pernet; /* protocol register */ err = sock_register(&can_family_ops); if (err) goto out_sock; dev_add_pack(&can_packet); dev_add_pack(&canfd_packet); dev_add_pack(&canxl_packet); return 0; out_sock: unregister_pernet_subsys(&can_pernet_ops); out_pernet: kmem_cache_destroy(rcv_cache); return err; } static __exit void can_exit(void) { /* protocol unregister */ dev_remove_pack(&canxl_packet); dev_remove_pack(&canfd_packet); dev_remove_pack(&can_packet); sock_unregister(PF_CAN); unregister_pernet_subsys(&can_pernet_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ kmem_cache_destroy(rcv_cache); } module_init(can_init); module_exit(can_exit);
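The filter matching rule documented above for can_rx_register() and the receive lists, <received_can_id> & mask == can_id & mask, can be tried out in isolation. The following user-space sketch is illustrative only and is not part of af_can.c: filter_matches() is a hypothetical helper, and only the constants and struct can_filter from <linux/can.h> are real.

/* Illustrative user-space sketch: evaluates the same match rule the kernel
 * documents above, including the CAN_INV_FILTER inversion handled by the
 * RX_INV list. Not kernel code. */
#include <stdbool.h>
#include <stdio.h>
#include <linux/can.h>

static bool filter_matches(const struct can_filter *f, canid_t rx_id)
{
	canid_t can_id = f->can_id;

	if (can_id & CAN_INV_FILTER) {
		/* inverted filter: deliver on mismatch */
		can_id &= ~CAN_INV_FILTER;
		return (rx_id & f->can_mask) != (can_id & f->can_mask);
	}
	return (rx_id & f->can_mask) == (can_id & f->can_mask);
}

int main(void)
{
	/* Subscribe to the single standard ID 0x123, data frames only,
	 * which is the rx_sff fast-path case described above. */
	struct can_filter f = {
		.can_id = 0x123,
		.can_mask = CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG,
	};

	printf("0x123 -> %d\n", filter_matches(&f, 0x123)); /* 1 */
	printf("0x124 -> %d\n", filter_matches(&f, 0x124)); /* 0 */
	return 0;
}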
536 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BIO_INTEGRITY_H #define _LINUX_BIO_INTEGRITY_H #include <linux/bio.h> enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ BIP_DISK_NOCHECK = 1 << 2, /* disable disk integrity checking */ BIP_IP_CHECKSUM = 1 << 3, /* IP checksum */ BIP_COPY_USER = 1 << 4, /* Kernel bounce buffer in use */ BIP_CHECK_GUARD = 1 << 5, /* guard check */ BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ BIP_P2P_DMA = 1 << 8, /* using P2P address */ }; struct bio_integrity_payload { struct bvec_iter bip_iter; unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ u16 app_tag; /* application tag value */ struct bio_vec *bip_vec; }; #define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \ BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) #ifdef CONFIG_BLK_DEV_INTEGRITY #define bip_for_each_vec(bvl, bip, iter) \ for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter) #define bio_for_each_integrity_vec(_bvl, _bio, _iter) \ for_each_bio(_bio) \ bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) { if (bio->bi_opf & REQ_INTEGRITY) return bio->bi_integrity; return NULL; } static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { struct bio_integrity_payload *bip = bio_integrity(bio); if (bip) return bip->bip_flags & flag; return false; } static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) { return bip->bip_iter.bi_sector; } static inline void bip_set_seed(struct bio_integrity_payload *bip, sector_t seed) { bip->bip_iter.bi_sector = seed; } void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip, struct bio_vec *bvecs, unsigned int nr_vecs); struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr); int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta); void bio_integrity_unmap_user(struct bio *bio); bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); void bio_integrity_trim(struct bio *bio); int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) { return NULL; } static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; } static inline int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) { return -EINVAL; } static inline void bio_integrity_unmap_user(struct bio *bio) { } static inline bool bio_integrity_prep(struct bio *bio) { return true; } static inline int 
bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask) { return 0; } static inline void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { } static inline void bio_integrity_trim(struct bio *bio) { } static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { return false; } static inline struct bio_integrity_payload * bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr) { return ERR_PTR(-EINVAL); } static inline int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { return 0; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ #endif /* _LINUX_BIO_INTEGRITY_H */
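To illustrate how the BIP_CLONE_FLAGS mask above behaves, the stand-alone sketch below re-declares the relevant flag values purely for illustration (it is not kernel code) and assumes the clone path keeps only bip_flags & BIP_CLONE_FLAGS, so flags such as BIP_BLOCK_INTEGRITY and BIP_COPY_USER are not carried over to the cloned payload.

/* Stand-alone illustration of the BIP_CLONE_FLAGS masking above; the values
 * simply mirror enum bip_flags and are not meant to replace the header. */
#include <stdio.h>

#define BIP_BLOCK_INTEGRITY	(1 << 0)
#define BIP_MAPPED_INTEGRITY	(1 << 1)
#define BIP_IP_CHECKSUM		(1 << 3)
#define BIP_COPY_USER		(1 << 4)
#define BIP_CHECK_GUARD		(1 << 5)
#define BIP_CHECK_REFTAG	(1 << 6)
#define BIP_CHECK_APPTAG	(1 << 7)

#define BIP_CLONE_FLAGS	(BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \
			 BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG)

int main(void)
{
	/* Hypothetical source payload: block-layer owned, guard+reftag checks. */
	unsigned short bip_flags = BIP_BLOCK_INTEGRITY | BIP_CHECK_GUARD |
				   BIP_CHECK_REFTAG;

	/* Only the flags listed in BIP_CLONE_FLAGS would survive a clone. */
	printf("cloned flags: 0x%x\n", bip_flags & BIP_CLONE_FLAGS); /* 0x60 */
	return 0;
}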
907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 // SPDX-License-Identifier: GPL-2.0 #include <linux/slab.h> #include <linux/lockdep.h> #include <linux/sysfs.h> #include <linux/kobject.h> #include <linux/memory.h> #include <linux/memory-tiers.h> #include <linux/notifier.h> #include <linux/sched/sysctl.h> #include "internal.h" struct memory_tier { /* hierarchy of memory tiers */ struct list_head list; /* list of all memory types part of this tier */ struct list_head memory_types; /* * start value of abstract distance. memory tier maps * an abstract distance range, * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE */ int adistance_start; struct device dev; /* All the nodes that are part of all the lower memory tiers. */ nodemask_t lower_tier_mask; }; struct demotion_nodes { nodemask_t preferred; }; struct node_memory_type_map { struct memory_dev_type *memtype; int map_count; }; static DEFINE_MUTEX(memory_tier_lock); static LIST_HEAD(memory_tiers); /* * The list is used to store all memory types that are not created * by a device driver. */ static LIST_HEAD(default_memory_types); static struct node_memory_type_map node_memory_types[MAX_NUMNODES]; struct memory_dev_type *default_dram_type; nodemask_t default_dram_nodes __initdata = NODE_MASK_NONE; static const struct bus_type memory_tier_subsys = { .name = "memory_tiering", .dev_name = "memory_tier", }; #ifdef CONFIG_NUMA_BALANCING /** * folio_use_access_time - check if a folio reuses cpupid for page access time * @folio: folio to check * * folio's _last_cpupid field is repurposed by memory tiering. In memory * tiering mode, cpupid of slow memory folio (not toptier memory) is used to * record page access time. * * Return: the folio _last_cpupid is used to record page access time */ bool folio_use_access_time(struct folio *folio) { return (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && !node_is_toptier(folio_nid(folio)); } #endif #ifdef CONFIG_MIGRATION static int top_tier_adistance; /* * node_demotion[] examples: * * Example 1: * * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes. * * node distances: * node 0 1 2 3 * 0 10 20 30 40 * 1 20 10 40 30 * 2 30 40 10 40 * 3 40 30 40 10 * * memory_tiers0 = 0-1 * memory_tiers1 = 2-3 * * node_demotion[0].preferred = 2 * node_demotion[1].preferred = 3 * node_demotion[2].preferred = <empty> * node_demotion[3].preferred = <empty> * * Example 2: * * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node. * * node distances: * node 0 1 2 * 0 10 20 30 * 1 20 10 30 * 2 30 30 10 * * memory_tiers0 = 0-2 * * node_demotion[0].preferred = <empty> * node_demotion[1].preferred = <empty> * node_demotion[2].preferred = <empty> * * Example 3: * * Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node. 
* * node distances: * node 0 1 2 * 0 10 20 30 * 1 20 10 40 * 2 30 40 10 * * memory_tiers0 = 1 * memory_tiers1 = 0 * memory_tiers2 = 2 * * node_demotion[0].preferred = 2 * node_demotion[1].preferred = 0 * node_demotion[2].preferred = <empty> * */ static struct demotion_nodes *node_demotion __read_mostly; #endif /* CONFIG_MIGRATION */ static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms); /* The lock is used to protect `default_dram_perf*` info and nid. */ static DEFINE_MUTEX(default_dram_perf_lock); static bool default_dram_perf_error; static struct access_coordinate default_dram_perf; static int default_dram_perf_ref_nid = NUMA_NO_NODE; static const char *default_dram_perf_ref_source; static inline struct memory_tier *to_memory_tier(struct device *device) { return container_of(device, struct memory_tier, dev); } static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier) { nodemask_t nodes = NODE_MASK_NONE; struct memory_dev_type *memtype; list_for_each_entry(memtype, &memtier->memory_types, tier_sibling) nodes_or(nodes, nodes, memtype->nodes); return nodes; } static void memory_tier_device_release(struct device *dev) { struct memory_tier *tier = to_memory_tier(dev); /* * synchronize_rcu in clear_node_memory_tier makes sure * we don't have rcu access to this memory tier. */ kfree(tier); } static ssize_t nodelist_show(struct device *dev, struct device_attribute *attr, char *buf) { int ret; nodemask_t nmask; mutex_lock(&memory_tier_lock); nmask = get_memtier_nodemask(to_memory_tier(dev)); ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask)); mutex_unlock(&memory_tier_lock); return ret; } static DEVICE_ATTR_RO(nodelist); static struct attribute *memtier_dev_attrs[] = { &dev_attr_nodelist.attr, NULL }; static const struct attribute_group memtier_dev_group = { .attrs = memtier_dev_attrs, }; static const struct attribute_group *memtier_dev_groups[] = { &memtier_dev_group, NULL }; static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype) { int ret; bool found_slot = false; struct memory_tier *memtier, *new_memtier; int adistance = memtype->adistance; unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE; lockdep_assert_held_once(&memory_tier_lock); adistance = round_down(adistance, memtier_adistance_chunk_size); /* * If the memtype is already part of a memory tier, * just return that. 
*/ if (!list_empty(&memtype->tier_sibling)) { list_for_each_entry(memtier, &memory_tiers, list) { if (adistance == memtier->adistance_start) return memtier; } WARN_ON(1); return ERR_PTR(-EINVAL); } list_for_each_entry(memtier, &memory_tiers, list) { if (adistance == memtier->adistance_start) { goto link_memtype; } else if (adistance < memtier->adistance_start) { found_slot = true; break; } } new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL); if (!new_memtier) return ERR_PTR(-ENOMEM); new_memtier->adistance_start = adistance; INIT_LIST_HEAD(&new_memtier->list); INIT_LIST_HEAD(&new_memtier->memory_types); if (found_slot) list_add_tail(&new_memtier->list, &memtier->list); else list_add_tail(&new_memtier->list, &memory_tiers); new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS; new_memtier->dev.bus = &memory_tier_subsys; new_memtier->dev.release = memory_tier_device_release; new_memtier->dev.groups = memtier_dev_groups; ret = device_register(&new_memtier->dev); if (ret) { list_del(&new_memtier->list); put_device(&new_memtier->dev); return ERR_PTR(ret); } memtier = new_memtier; link_memtype: list_add(&memtype->tier_sibling, &memtier->memory_types); return memtier; } static struct memory_tier *__node_get_memory_tier(int node) { pg_data_t *pgdat; pgdat = NODE_DATA(node); if (!pgdat) return NULL; /* * Since we hold memory_tier_lock, we can avoid * RCU read locks when accessing the details. No * parallel updates are possible here. */ return rcu_dereference_check(pgdat->memtier, lockdep_is_held(&memory_tier_lock)); } #ifdef CONFIG_MIGRATION bool node_is_toptier(int node) { bool toptier; pg_data_t *pgdat; struct memory_tier *memtier; pgdat = NODE_DATA(node); if (!pgdat) return false; rcu_read_lock(); memtier = rcu_dereference(pgdat->memtier); if (!memtier) { toptier = true; goto out; } if (memtier->adistance_start <= top_tier_adistance) toptier = true; else toptier = false; out: rcu_read_unlock(); return toptier; } void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets) { struct memory_tier *memtier; /* * pg_data_t.memtier updates includes a synchronize_rcu() * which ensures that we either find NULL or a valid memtier * in NODE_DATA. protect the access via rcu_read_lock(); */ rcu_read_lock(); memtier = rcu_dereference(pgdat->memtier); if (memtier) *targets = memtier->lower_tier_mask; else *targets = NODE_MASK_NONE; rcu_read_unlock(); } /** * next_demotion_node() - Get the next node in the demotion path * @node: The starting node to lookup the next node * * Return: node id for next memory node in the demotion path hierarchy * from @node; NUMA_NO_NODE if @node is terminal. This does not keep * @node online or guarantee that it *continues* to be the next demotion * target. */ int next_demotion_node(int node) { struct demotion_nodes *nd; int target; if (!node_demotion) return NUMA_NO_NODE; nd = &node_demotion[node]; /* * node_demotion[] is updated without excluding this * function from running. * * Make sure to use RCU over entire code blocks if * node_demotion[] reads need to be consistent. */ rcu_read_lock(); /* * If there are multiple target nodes, just select one * target node randomly. * * In addition, we can also use round-robin to select * target node, but we should introduce another variable * for node_demotion[] to record last selected target node, * that may cause cache ping-pong due to the changing of * last target node. Or introducing per-cpu data to avoid * caching issue, which seems more complicated. So selecting * target node randomly seems better until now. 
*/ target = node_random(&nd->preferred); rcu_read_unlock(); return target; } static void disable_all_demotion_targets(void) { struct memory_tier *memtier; int node; for_each_node_state(node, N_MEMORY) { node_demotion[node].preferred = NODE_MASK_NONE; /* * We are holding memory_tier_lock, it is safe * to access pgda->memtier. */ memtier = __node_get_memory_tier(node); if (memtier) memtier->lower_tier_mask = NODE_MASK_NONE; } /* * Ensure that the "disable" is visible across the system. * Readers will see either a combination of before+disable * state or disable+after. They will never see before and * after state together. */ synchronize_rcu(); } static void dump_demotion_targets(void) { int node; for_each_node_state(node, N_MEMORY) { struct memory_tier *memtier = __node_get_memory_tier(node); nodemask_t preferred = node_demotion[node].preferred; if (!memtier) continue; if (nodes_empty(preferred)) pr_info("Demotion targets for Node %d: null\n", node); else pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n", node, nodemask_pr_args(&preferred), nodemask_pr_args(&memtier->lower_tier_mask)); } } /* * Find an automatic demotion target for all memory * nodes. Failing here is OK. It might just indicate * being at the end of a chain. */ static void establish_demotion_targets(void) { struct memory_tier *memtier; struct demotion_nodes *nd; int target = NUMA_NO_NODE, node; int distance, best_distance; nodemask_t tier_nodes, lower_tier; lockdep_assert_held_once(&memory_tier_lock); if (!node_demotion) return; disable_all_demotion_targets(); for_each_node_state(node, N_MEMORY) { best_distance = -1; nd = &node_demotion[node]; memtier = __node_get_memory_tier(node); if (!memtier || list_is_last(&memtier->list, &memory_tiers)) continue; /* * Get the lower memtier to find the demotion node list. */ memtier = list_next_entry(memtier, list); tier_nodes = get_memtier_nodemask(memtier); /* * find_next_best_node, use 'used' nodemask as a skip list. * Add all memory nodes except the selected memory tier * nodelist to skip list so that we find the best node from the * memtier nodelist. */ nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes); /* * Find all the nodes in the memory tier node list of same best distance. * add them to the preferred mask. We randomly select between nodes * in the preferred mask when allocating pages during demotion. */ do { target = find_next_best_node(node, &tier_nodes); if (target == NUMA_NO_NODE) break; distance = node_distance(node, target); if (distance == best_distance || best_distance == -1) { best_distance = distance; node_set(target, nd->preferred); } else { break; } } while (1); } /* * Promotion is allowed from a memory tier to higher * memory tier only if the memory tier doesn't include * compute. We want to skip promotion from a memory tier, * if any node that is part of the memory tier have CPUs. * Once we detect such a memory tier, we consider that tier * as top tiper from which promotion is not allowed. */ list_for_each_entry_reverse(memtier, &memory_tiers, list) { tier_nodes = get_memtier_nodemask(memtier); nodes_and(tier_nodes, node_states[N_CPU], tier_nodes); if (!nodes_empty(tier_nodes)) { /* * abstract distance below the max value of this memtier * is considered toptier. */ top_tier_adistance = memtier->adistance_start + MEMTIER_CHUNK_SIZE - 1; break; } } /* * Now build the lower_tier mask for each node collecting node mask from * all memory tier below it. 
This allows us to fallback demotion page * allocation to a set of nodes that is closer the above selected * preferred node. */ lower_tier = node_states[N_MEMORY]; list_for_each_entry(memtier, &memory_tiers, list) { /* * Keep removing current tier from lower_tier nodes, * This will remove all nodes in current and above * memory tier from the lower_tier mask. */ tier_nodes = get_memtier_nodemask(memtier); nodes_andnot(lower_tier, lower_tier, tier_nodes); memtier->lower_tier_mask = lower_tier; } dump_demotion_targets(); } #else static inline void establish_demotion_targets(void) {} #endif /* CONFIG_MIGRATION */ static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype) { if (!node_memory_types[node].memtype) node_memory_types[node].memtype = memtype; /* * for each device getting added in the same NUMA node * with this specific memtype, bump the map count. We * Only take memtype device reference once, so that * changing a node memtype can be done by droping the * only reference count taken here. */ if (node_memory_types[node].memtype == memtype) { if (!node_memory_types[node].map_count++) kref_get(&memtype->kref); } } static struct memory_tier *set_node_memory_tier(int node) { struct memory_tier *memtier; struct memory_dev_type *memtype = default_dram_type; int adist = MEMTIER_ADISTANCE_DRAM; pg_data_t *pgdat = NODE_DATA(node); lockdep_assert_held_once(&memory_tier_lock); if (!node_state(node, N_MEMORY)) return ERR_PTR(-EINVAL); mt_calc_adistance(node, &adist); if (!node_memory_types[node].memtype) { memtype = mt_find_alloc_memory_type(adist, &default_memory_types); if (IS_ERR(memtype)) { memtype = default_dram_type; pr_info("Failed to allocate a memory type. Fall back.\n"); } } __init_node_memory_type(node, memtype); memtype = node_memory_types[node].memtype; node_set(node, memtype->nodes); memtier = find_create_memory_tier(memtype); if (!IS_ERR(memtier)) rcu_assign_pointer(pgdat->memtier, memtier); return memtier; } static void destroy_memory_tier(struct memory_tier *memtier) { list_del(&memtier->list); device_unregister(&memtier->dev); } static bool clear_node_memory_tier(int node) { bool cleared = false; pg_data_t *pgdat; struct memory_tier *memtier; pgdat = NODE_DATA(node); if (!pgdat) return false; /* * Make sure that anybody looking at NODE_DATA who finds * a valid memtier finds memory_dev_types with nodes still * linked to the memtier. We achieve this by waiting for * rcu read section to finish using synchronize_rcu. 
* This also enables us to free the destroyed memory tier * with kfree instead of kfree_rcu */ memtier = __node_get_memory_tier(node); if (memtier) { struct memory_dev_type *memtype; rcu_assign_pointer(pgdat->memtier, NULL); synchronize_rcu(); memtype = node_memory_types[node].memtype; node_clear(node, memtype->nodes); if (nodes_empty(memtype->nodes)) { list_del_init(&memtype->tier_sibling); if (list_empty(&memtier->memory_types)) destroy_memory_tier(memtier); } cleared = true; } return cleared; } static void release_memtype(struct kref *kref) { struct memory_dev_type *memtype; memtype = container_of(kref, struct memory_dev_type, kref); kfree(memtype); } struct memory_dev_type *alloc_memory_type(int adistance) { struct memory_dev_type *memtype; memtype = kmalloc(sizeof(*memtype), GFP_KERNEL); if (!memtype) return ERR_PTR(-ENOMEM); memtype->adistance = adistance; INIT_LIST_HEAD(&memtype->tier_sibling); memtype->nodes = NODE_MASK_NONE; kref_init(&memtype->kref); return memtype; } EXPORT_SYMBOL_GPL(alloc_memory_type); void put_memory_type(struct memory_dev_type *memtype) { kref_put(&memtype->kref, release_memtype); } EXPORT_SYMBOL_GPL(put_memory_type); void init_node_memory_type(int node, struct memory_dev_type *memtype) { mutex_lock(&memory_tier_lock); __init_node_memory_type(node, memtype); mutex_unlock(&memory_tier_lock); } EXPORT_SYMBOL_GPL(init_node_memory_type); void clear_node_memory_type(int node, struct memory_dev_type *memtype) { mutex_lock(&memory_tier_lock); if (node_memory_types[node].memtype == memtype || !memtype) node_memory_types[node].map_count--; /* * If we umapped all the attached devices to this node, * clear the node memory type. */ if (!node_memory_types[node].map_count) { memtype = node_memory_types[node].memtype; node_memory_types[node].memtype = NULL; put_memory_type(memtype); } mutex_unlock(&memory_tier_lock); } EXPORT_SYMBOL_GPL(clear_node_memory_type); struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types) { struct memory_dev_type *mtype; list_for_each_entry(mtype, memory_types, list) if (mtype->adistance == adist) return mtype; mtype = alloc_memory_type(adist); if (IS_ERR(mtype)) return mtype; list_add(&mtype->list, memory_types); return mtype; } EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type); void mt_put_memory_types(struct list_head *memory_types) { struct memory_dev_type *mtype, *mtn; list_for_each_entry_safe(mtype, mtn, memory_types, list) { list_del(&mtype->list); put_memory_type(mtype); } } EXPORT_SYMBOL_GPL(mt_put_memory_types); /* * This is invoked via `late_initcall()` to initialize memory tiers for * memory nodes, both with and without CPUs. After the initialization of * firmware and devices, adistance algorithms are expected to be provided. */ static int __init memory_tier_late_init(void) { int nid; struct memory_tier *memtier; get_online_mems(); guard(mutex)(&memory_tier_lock); /* Assign each uninitialized N_MEMORY node to a memory tier. */ for_each_node_state(nid, N_MEMORY) { /* * Some device drivers may have initialized * memory tiers, potentially bringing memory nodes * online and configuring memory tiers. * Exclude them here. 
*/ if (node_memory_types[nid].memtype) continue; memtier = set_node_memory_tier(nid); if (IS_ERR(memtier)) continue; } establish_demotion_targets(); put_online_mems(); return 0; } late_initcall(memory_tier_late_init); static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix) { pr_info( "%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n", prefix, coord->read_latency, coord->write_latency, coord->read_bandwidth, coord->write_bandwidth); } int mt_set_default_dram_perf(int nid, struct access_coordinate *perf, const char *source) { guard(mutex)(&default_dram_perf_lock); if (default_dram_perf_error) return -EIO; if (perf->read_latency + perf->write_latency == 0 || perf->read_bandwidth + perf->write_bandwidth == 0) return -EINVAL; if (default_dram_perf_ref_nid == NUMA_NO_NODE) { default_dram_perf = *perf; default_dram_perf_ref_nid = nid; default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL); return 0; } /* * The performance of all default DRAM nodes is expected to be * same (that is, the variation is less than 10%). And it * will be used as base to calculate the abstract distance of * other memory nodes. */ if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 > default_dram_perf.read_latency || abs(perf->write_latency - default_dram_perf.write_latency) * 10 > default_dram_perf.write_latency || abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 > default_dram_perf.read_bandwidth || abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 > default_dram_perf.write_bandwidth) { pr_info( "memory-tiers: the performance of DRAM node %d mismatches that of the reference\n" "DRAM node %d.\n", nid, default_dram_perf_ref_nid); pr_info(" performance of reference DRAM node %d from %s:\n", default_dram_perf_ref_nid, default_dram_perf_ref_source); dump_hmem_attrs(&default_dram_perf, " "); pr_info(" performance of DRAM node %d from %s:\n", nid, source); dump_hmem_attrs(perf, " "); pr_info( " disable default DRAM node performance based abstract distance algorithm.\n"); default_dram_perf_error = true; return -EINVAL; } return 0; } int mt_perf_to_adistance(struct access_coordinate *perf, int *adist) { guard(mutex)(&default_dram_perf_lock); if (default_dram_perf_error) return -EIO; if (perf->read_latency + perf->write_latency == 0 || perf->read_bandwidth + perf->write_bandwidth == 0) return -EINVAL; if (default_dram_perf_ref_nid == NUMA_NO_NODE) return -ENOENT; /* * The abstract distance of a memory node is in direct proportion to * its memory latency (read + write) and inversely proportional to its * memory bandwidth (read + write). The abstract distance, memory * latency, and memory bandwidth of the default DRAM nodes are used as * the base. */ *adist = MEMTIER_ADISTANCE_DRAM * (perf->read_latency + perf->write_latency) / (default_dram_perf.read_latency + default_dram_perf.write_latency) * (default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) / (perf->read_bandwidth + perf->write_bandwidth); return 0; } EXPORT_SYMBOL_GPL(mt_perf_to_adistance); /** * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm * @nb: The notifier block which describe the algorithm * * Return: 0 on success, errno on error. * * Every memory tiering abstract distance algorithm provider needs to * register the algorithm with register_mt_adistance_algorithm(). 
To * calculate the abstract distance for a specified memory node, the * notifier function will be called unless some high priority * algorithm has provided result. The prototype of the notifier * function is as follows, * * int (*algorithm_notifier)(struct notifier_block *nb, * unsigned long nid, void *data); * * Where "nid" specifies the memory node, "data" is the pointer to the * returned abstract distance (that is, "int *adist"). If the * algorithm provides the result, NOTIFY_STOP should be returned. * Otherwise, return_value & %NOTIFY_STOP_MASK == 0 to allow the next * algorithm in the chain to provide the result. */ int register_mt_adistance_algorithm(struct notifier_block *nb) { return blocking_notifier_chain_register(&mt_adistance_algorithms, nb); } EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm); /** * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract distance algorithm * @nb: the notifier block which describe the algorithm * * Return: 0 on success, errno on error. */ int unregister_mt_adistance_algorithm(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb); } EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm); /** * mt_calc_adistance() - Calculate abstract distance with registered algorithms * @node: the node to calculate abstract distance for * @adist: the returned abstract distance * * Return: if return_value & %NOTIFY_STOP_MASK != 0, then some * abstract distance algorithm provides the result, and return it via * @adist. Otherwise, no algorithm can provide the result and @adist * will be kept as it is. */ int mt_calc_adistance(int node, int *adist) { return blocking_notifier_call_chain(&mt_adistance_algorithms, node, adist); } EXPORT_SYMBOL_GPL(mt_calc_adistance); static int __meminit memtier_hotplug_callback(struct notifier_block *self, unsigned long action, void *_arg) { struct memory_tier *memtier; struct node_notify *nn = _arg; switch (action) { case NODE_REMOVED_LAST_MEMORY: mutex_lock(&memory_tier_lock); if (clear_node_memory_tier(nn->nid)) establish_demotion_targets(); mutex_unlock(&memory_tier_lock); break; case NODE_ADDED_FIRST_MEMORY: mutex_lock(&memory_tier_lock); memtier = set_node_memory_tier(nn->nid); if (!IS_ERR(memtier)) establish_demotion_targets(); mutex_unlock(&memory_tier_lock); break; } return notifier_from_errno(0); } static int __init memory_tier_init(void) { int ret; ret = subsys_virtual_register(&memory_tier_subsys, NULL); if (ret) panic("%s() failed to register memory tier subsystem\n", __func__); #ifdef CONFIG_MIGRATION node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes), GFP_KERNEL); WARN_ON(!node_demotion); #endif mutex_lock(&memory_tier_lock); /* * For now we can have 4 faster memory tiers with smaller adistance * than default DRAM tier. */ default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM, &default_memory_types); mutex_unlock(&memory_tier_lock); if (IS_ERR(default_dram_type)) panic("%s() failed to allocate default DRAM tier\n", __func__); /* Record nodes with memory and CPU to set default DRAM performance. 
*/ nodes_and(default_dram_nodes, node_states[N_MEMORY], node_states[N_CPU]); hotplug_node_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI); return 0; } subsys_initcall(memory_tier_init); bool numa_demotion_enabled = false; #ifdef CONFIG_MIGRATION #ifdef CONFIG_SYSFS static ssize_t demotion_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", str_true_false(numa_demotion_enabled)); } static ssize_t demotion_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { ssize_t ret; bool before = numa_demotion_enabled; ret = kstrtobool(buf, &numa_demotion_enabled); if (ret) return ret; /* * Reset kswapd_failures statistics. They may no longer be * valid since the policy for kswapd has changed. */ if (before == false && numa_demotion_enabled == true) { struct pglist_data *pgdat; for_each_online_pgdat(pgdat) atomic_set(&pgdat->kswapd_failures, 0); } return count; } static struct kobj_attribute numa_demotion_enabled_attr = __ATTR_RW(demotion_enabled); static struct attribute *numa_attrs[] = { &numa_demotion_enabled_attr.attr, NULL, }; static const struct attribute_group numa_attr_group = { .attrs = numa_attrs, }; static int __init numa_init_sysfs(void) { int err; struct kobject *numa_kobj; numa_kobj = kobject_create_and_add("numa", mm_kobj); if (!numa_kobj) { pr_err("failed to create numa kobject\n"); return -ENOMEM; } err = sysfs_create_group(numa_kobj, &numa_attr_group); if (err) { pr_err("failed to register numa group\n"); goto delete_obj; } return 0; delete_obj: kobject_put(numa_kobj); return err; } subsys_initcall(numa_init_sysfs); #endif /* CONFIG_SYSFS */ #endif
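The comment in mt_perf_to_adistance() above boils down to adist = base * (node latency / DRAM latency) * (DRAM bandwidth / node bandwidth), evaluated with integer arithmetic in that order. The sketch below reproduces only that arithmetic; BASE_ADISTANCE, struct perf, perf_to_adistance() and the numbers are made-up stand-ins for illustration, not the kernel's MEMTIER_ADISTANCE_DRAM or struct access_coordinate.

/* Stand-alone sketch of the abstract-distance formula documented above.
 * The base value and the performance numbers are illustrative only. */
#include <stdio.h>

#define BASE_ADISTANCE 1024	/* stand-in for MEMTIER_ADISTANCE_DRAM */

struct perf {
	unsigned int read_latency, write_latency;	/* e.g. nanoseconds */
	unsigned int read_bandwidth, write_bandwidth;	/* e.g. MB/s */
};

static int perf_to_adistance(const struct perf *dram, const struct perf *node)
{
	/* Proportional to latency, inversely proportional to bandwidth,
	 * with the default DRAM node as the reference point. */
	return BASE_ADISTANCE *
	       (node->read_latency + node->write_latency) /
	       (dram->read_latency + dram->write_latency) *
	       (dram->read_bandwidth + dram->write_bandwidth) /
	       (node->read_bandwidth + node->write_bandwidth);
}

int main(void)
{
	struct perf dram = { 80, 80, 20000, 20000 };	/* reference DRAM node */
	struct perf slow = { 320, 320, 5000, 5000 };	/* 4x latency, 1/4 bw */

	/* 4x slower on both axes => 16x the DRAM abstract distance (16384). */
	printf("adist = %d\n", perf_to_adistance(&dram, &slow));
	return 0;
}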
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * vimc-debayer.c Virtual Media Controller Driver
 *
 * Copyright (C) 2015-2017 Helen Koike <helen.fornazier@gmail.com>
 */

#include <linux/moduleparam.h>
#include <linux/platform_device.h>
#include <linux/vmalloc.h>
#include <linux/v4l2-mediabus.h>
#include <media/v4l2-ctrls.h>
#include <media/v4l2-event.h>
#include <media/v4l2-subdev.h>

#include "vimc-common.h"

/* TODO: Add support for more output formats, we only support RGB888 for now. */
#define VIMC_DEBAYER_SOURCE_MBUS_FMT MEDIA_BUS_FMT_RGB888_1X24

enum vimc_debayer_rgb_colors {
	VIMC_DEBAYER_RED = 0,
	VIMC_DEBAYER_GREEN = 1,
	VIMC_DEBAYER_BLUE = 2,
};

struct vimc_debayer_pix_map {
	u32 code;
	enum vimc_debayer_rgb_colors order[2][2];
};

struct vimc_debayer_device {
	struct vimc_ent_device ved;
	struct v4l2_subdev sd;
	struct v4l2_ctrl_handler hdl;
	struct media_pad pads[2];
	u8 *src_frame;
	void (*set_rgb_src)(struct vimc_debayer_device *vdebayer,
			    unsigned int lin, unsigned int col,
			    unsigned int rgb[3]);
	/*
	 * Virtual "hardware" configuration, filled when the stream starts or
	 * when controls are set.
*/ struct { const struct vimc_debayer_pix_map *sink_pix_map; unsigned int sink_bpp; struct v4l2_area size; unsigned int mean_win_size; u32 src_code; } hw; }; static const struct v4l2_mbus_framefmt sink_fmt_default = { .width = 640, .height = 480, .code = MEDIA_BUS_FMT_SRGGB8_1X8, .field = V4L2_FIELD_NONE, .colorspace = V4L2_COLORSPACE_SRGB, }; static const u32 vimc_debayer_src_mbus_codes[] = { MEDIA_BUS_FMT_GBR888_1X24, MEDIA_BUS_FMT_BGR888_1X24, MEDIA_BUS_FMT_BGR888_3X8, MEDIA_BUS_FMT_RGB888_1X24, MEDIA_BUS_FMT_RGB888_2X12_BE, MEDIA_BUS_FMT_RGB888_2X12_LE, MEDIA_BUS_FMT_RGB888_3X8, MEDIA_BUS_FMT_RGB888_1X7X4_SPWG, MEDIA_BUS_FMT_RGB888_1X7X4_JEIDA, MEDIA_BUS_FMT_RGB888_1X32_PADHI, }; static const struct vimc_debayer_pix_map vimc_debayer_pix_map_list[] = { { .code = MEDIA_BUS_FMT_SBGGR8_1X8, .order = { { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN }, { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED } } }, { .code = MEDIA_BUS_FMT_SGBRG8_1X8, .order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE }, { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN } } }, { .code = MEDIA_BUS_FMT_SGRBG8_1X8, .order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED }, { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN } } }, { .code = MEDIA_BUS_FMT_SRGGB8_1X8, .order = { { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN }, { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE } } }, { .code = MEDIA_BUS_FMT_SBGGR10_1X10, .order = { { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN }, { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED } } }, { .code = MEDIA_BUS_FMT_SGBRG10_1X10, .order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE }, { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN } } }, { .code = MEDIA_BUS_FMT_SGRBG10_1X10, .order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED }, { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN } } }, { .code = MEDIA_BUS_FMT_SRGGB10_1X10, .order = { { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN }, { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE } } }, { .code = MEDIA_BUS_FMT_SBGGR12_1X12, .order = { { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN }, { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED } } }, { .code = MEDIA_BUS_FMT_SGBRG12_1X12, .order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE }, { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN } } }, { .code = MEDIA_BUS_FMT_SGRBG12_1X12, .order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED }, { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN } } }, { .code = MEDIA_BUS_FMT_SRGGB12_1X12, .order = { { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN }, { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE } } }, }; static const struct vimc_debayer_pix_map *vimc_debayer_pix_map_by_code(u32 code) { unsigned int i; for (i = 0; i < ARRAY_SIZE(vimc_debayer_pix_map_list); i++) if (vimc_debayer_pix_map_list[i].code == code) return &vimc_debayer_pix_map_list[i]; return NULL; } static bool vimc_debayer_src_code_is_valid(u32 code) { unsigned int i; for (i = 0; i < ARRAY_SIZE(vimc_debayer_src_mbus_codes); i++) if (vimc_debayer_src_mbus_codes[i] == code) return true; return false; } static int vimc_debayer_init_state(struct v4l2_subdev *sd, struct v4l2_subdev_state *sd_state) { struct v4l2_mbus_framefmt *mf; mf = v4l2_subdev_state_get_format(sd_state, 0); *mf = sink_fmt_default; mf = v4l2_subdev_state_get_format(sd_state, 1); *mf = sink_fmt_default; mf->code = VIMC_DEBAYER_SOURCE_MBUS_FMT; return 0; } static int vimc_debayer_enum_mbus_code(struct v4l2_subdev *sd, struct v4l2_subdev_state *sd_state, struct v4l2_subdev_mbus_code_enum *code) { if (VIMC_IS_SRC(code->pad)) { if (code->index >= ARRAY_SIZE(vimc_debayer_src_mbus_codes)) return -EINVAL; code->code = vimc_debayer_src_mbus_codes[code->index]; } else { if (code->index >= 
ARRAY_SIZE(vimc_debayer_pix_map_list)) return -EINVAL; code->code = vimc_debayer_pix_map_list[code->index].code; } return 0; } static int vimc_debayer_enum_frame_size(struct v4l2_subdev *sd, struct v4l2_subdev_state *sd_state, struct v4l2_subdev_frame_size_enum *fse) { if (fse->index) return -EINVAL; if (VIMC_IS_SINK(fse->pad)) { const struct vimc_debayer_pix_map *vpix = vimc_debayer_pix_map_by_code(fse->code); if (!vpix) return -EINVAL; } else if (!vimc_debayer_src_code_is_valid(fse->code)) { return -EINVAL; } fse->min_width = VIMC_FRAME_MIN_WIDTH; fse->max_width = VIMC_FRAME_MAX_WIDTH; fse->min_height = VIMC_FRAME_MIN_HEIGHT; fse->max_height = VIMC_FRAME_MAX_HEIGHT; return 0; } static void vimc_debayer_adjust_sink_fmt(struct v4l2_mbus_framefmt *fmt) { const struct vimc_debayer_pix_map *vpix; /* Don't accept a code that is not on the debayer table */ vpix = vimc_debayer_pix_map_by_code(fmt->code); if (!vpix) fmt->code = sink_fmt_default.code; fmt->width = clamp_t(u32, fmt->width, VIMC_FRAME_MIN_WIDTH, VIMC_FRAME_MAX_WIDTH) & ~1; fmt->height = clamp_t(u32, fmt->height, VIMC_FRAME_MIN_HEIGHT, VIMC_FRAME_MAX_HEIGHT) & ~1; if (fmt->field == V4L2_FIELD_ANY) fmt->field = sink_fmt_default.field; vimc_colorimetry_clamp(fmt); } static int vimc_debayer_set_fmt(struct v4l2_subdev *sd, struct v4l2_subdev_state *sd_state, struct v4l2_subdev_format *fmt) { struct vimc_debayer_device *vdebayer = v4l2_get_subdevdata(sd); struct v4l2_mbus_framefmt *format; /* Do not change the format while stream is on. */ if (fmt->which == V4L2_SUBDEV_FORMAT_ACTIVE && vdebayer->src_frame) return -EBUSY; /* * Do not change the format of the source pad, it is propagated from * the sink. */ if (VIMC_IS_SRC(fmt->pad)) return v4l2_subdev_get_fmt(sd, sd_state, fmt); /* Set the new format in the sink pad. */ vimc_debayer_adjust_sink_fmt(&fmt->format); format = v4l2_subdev_state_get_format(sd_state, 0); dev_dbg(vdebayer->ved.dev, "%s: sink format update: " "old:%dx%d (0x%x, %d, %d, %d, %d) " "new:%dx%d (0x%x, %d, %d, %d, %d)\n", vdebayer->sd.name, /* old */ format->width, format->height, format->code, format->colorspace, format->quantization, format->xfer_func, format->ycbcr_enc, /* new */ fmt->format.width, fmt->format.height, fmt->format.code, fmt->format.colorspace, fmt->format.quantization, fmt->format.xfer_func, fmt->format.ycbcr_enc); *format = fmt->format; /* Propagate the format to the source pad. 
*/ format = v4l2_subdev_state_get_format(sd_state, 1); *format = fmt->format; format->code = VIMC_DEBAYER_SOURCE_MBUS_FMT; return 0; } static const struct v4l2_subdev_pad_ops vimc_debayer_pad_ops = { .enum_mbus_code = vimc_debayer_enum_mbus_code, .enum_frame_size = vimc_debayer_enum_frame_size, .get_fmt = v4l2_subdev_get_fmt, .set_fmt = vimc_debayer_set_fmt, }; static void vimc_debayer_process_rgb_frame(struct vimc_debayer_device *vdebayer, unsigned int lin, unsigned int col, unsigned int rgb[3]) { const struct vimc_pix_map *vpix; unsigned int i, index; vpix = vimc_pix_map_by_code(vdebayer->hw.src_code); index = VIMC_FRAME_INDEX(lin, col, vdebayer->hw.size.width, 3); for (i = 0; i < 3; i++) { switch (vpix->pixelformat) { case V4L2_PIX_FMT_RGB24: vdebayer->src_frame[index + i] = rgb[i]; break; case V4L2_PIX_FMT_BGR24: vdebayer->src_frame[index + i] = rgb[2 - i]; break; } } } static int vimc_debayer_s_stream(struct v4l2_subdev *sd, int enable) { struct vimc_debayer_device *vdebayer = v4l2_get_subdevdata(sd); if (enable) { const struct v4l2_mbus_framefmt *sink_fmt; const struct v4l2_mbus_framefmt *src_fmt; struct v4l2_subdev_state *state; const struct vimc_pix_map *vpix; unsigned int frame_size; if (vdebayer->src_frame) return 0; state = v4l2_subdev_lock_and_get_active_state(sd); sink_fmt = v4l2_subdev_state_get_format(state, 0); src_fmt = v4l2_subdev_state_get_format(state, 1); /* Calculate the frame size of the source pad */ vpix = vimc_pix_map_by_code(src_fmt->code); frame_size = src_fmt->width * src_fmt->height * vpix->bpp; /* Save the bytes per pixel of the sink */ vpix = vimc_pix_map_by_code(sink_fmt->code); vdebayer->hw.sink_bpp = vpix->bpp; /* Get the corresponding pixel map from the table */ vdebayer->hw.sink_pix_map = vimc_debayer_pix_map_by_code(sink_fmt->code); vdebayer->hw.size.width = sink_fmt->width; vdebayer->hw.size.height = sink_fmt->height; vdebayer->hw.src_code = src_fmt->code; v4l2_subdev_unlock_state(state); /* * Allocate the frame buffer. 
Use vmalloc to be able to * allocate a large amount of memory */ vdebayer->src_frame = vmalloc(frame_size); if (!vdebayer->src_frame) return -ENOMEM; } else { if (!vdebayer->src_frame) return 0; vfree(vdebayer->src_frame); vdebayer->src_frame = NULL; } return 0; } static const struct v4l2_subdev_core_ops vimc_debayer_core_ops = { .log_status = v4l2_ctrl_subdev_log_status, .subscribe_event = v4l2_ctrl_subdev_subscribe_event, .unsubscribe_event = v4l2_event_subdev_unsubscribe, }; static const struct v4l2_subdev_video_ops vimc_debayer_video_ops = { .s_stream = vimc_debayer_s_stream, }; static const struct v4l2_subdev_ops vimc_debayer_ops = { .core = &vimc_debayer_core_ops, .pad = &vimc_debayer_pad_ops, .video = &vimc_debayer_video_ops, }; static const struct v4l2_subdev_internal_ops vimc_debayer_internal_ops = { .init_state = vimc_debayer_init_state, }; static unsigned int vimc_debayer_get_val(const u8 *bytes, const unsigned int n_bytes) { unsigned int i; unsigned int acc = 0; for (i = 0; i < n_bytes; i++) acc = acc + (bytes[i] << (8 * i)); return acc; } static void vimc_debayer_calc_rgb_sink(struct vimc_debayer_device *vdebayer, const u8 *frame, const unsigned int lin, const unsigned int col, unsigned int rgb[3]) { unsigned int i, seek, wlin, wcol; unsigned int n_rgb[3] = {0, 0, 0}; for (i = 0; i < 3; i++) rgb[i] = 0; /* * Calculate how many we need to subtract to get to the pixel in * the top left corner of the mean window (considering the current * pixel as the center) */ seek = vdebayer->hw.mean_win_size / 2; /* Sum the values of the colors in the mean window */ dev_dbg(vdebayer->ved.dev, "deb: %s: --- Calc pixel %dx%d, window mean %d, seek %d ---\n", vdebayer->sd.name, lin, col, vdebayer->hw.size.height, seek); /* * Iterate through all the lines in the mean window, start * with zero if the pixel is outside the frame and don't pass * the height when the pixel is in the bottom border of the * frame */ for (wlin = seek > lin ? 0 : lin - seek; wlin < lin + seek + 1 && wlin < vdebayer->hw.size.height; wlin++) { /* * Iterate through all the columns in the mean window, start * with zero if the pixel is outside the frame and don't pass * the width when the pixel is in the right border of the * frame */ for (wcol = seek > col ? 
0 : col - seek; wcol < col + seek + 1 && wcol < vdebayer->hw.size.width; wcol++) { enum vimc_debayer_rgb_colors color; unsigned int index; /* Check which color this pixel is */ color = vdebayer->hw.sink_pix_map->order[wlin % 2][wcol % 2]; index = VIMC_FRAME_INDEX(wlin, wcol, vdebayer->hw.size.width, vdebayer->hw.sink_bpp); dev_dbg(vdebayer->ved.dev, "deb: %s: RGB CALC: frame index %d, win pos %dx%d, color %d\n", vdebayer->sd.name, index, wlin, wcol, color); /* Get its value */ rgb[color] = rgb[color] + vimc_debayer_get_val(&frame[index], vdebayer->hw.sink_bpp); /* Save how many values we already added */ n_rgb[color]++; dev_dbg(vdebayer->ved.dev, "deb: %s: RGB CALC: val %d, n %d\n", vdebayer->sd.name, rgb[color], n_rgb[color]); } } /* Calculate the mean */ for (i = 0; i < 3; i++) { dev_dbg(vdebayer->ved.dev, "deb: %s: PRE CALC: %dx%d Color %d, val %d, n %d\n", vdebayer->sd.name, lin, col, i, rgb[i], n_rgb[i]); if (n_rgb[i]) rgb[i] = rgb[i] / n_rgb[i]; dev_dbg(vdebayer->ved.dev, "deb: %s: FINAL CALC: %dx%d Color %d, val %d\n", vdebayer->sd.name, lin, col, i, rgb[i]); } } static void *vimc_debayer_process_frame(struct vimc_ent_device *ved, const void *sink_frame) { struct vimc_debayer_device *vdebayer = container_of(ved, struct vimc_debayer_device, ved); unsigned int rgb[3]; unsigned int i, j; /* If the stream in this node is not active, just return */ if (!vdebayer->src_frame) return ERR_PTR(-EINVAL); for (i = 0; i < vdebayer->hw.size.height; i++) for (j = 0; j < vdebayer->hw.size.width; j++) { vimc_debayer_calc_rgb_sink(vdebayer, sink_frame, i, j, rgb); vdebayer->set_rgb_src(vdebayer, i, j, rgb); } return vdebayer->src_frame; } static int vimc_debayer_s_ctrl(struct v4l2_ctrl *ctrl) { struct vimc_debayer_device *vdebayer = container_of(ctrl->handler, struct vimc_debayer_device, hdl); switch (ctrl->id) { case VIMC_CID_MEAN_WIN_SIZE: vdebayer->hw.mean_win_size = ctrl->val; break; default: return -EINVAL; } return 0; } static const struct v4l2_ctrl_ops vimc_debayer_ctrl_ops = { .s_ctrl = vimc_debayer_s_ctrl, }; static void vimc_debayer_release(struct vimc_ent_device *ved) { struct vimc_debayer_device *vdebayer = container_of(ved, struct vimc_debayer_device, ved); v4l2_ctrl_handler_free(&vdebayer->hdl); v4l2_subdev_cleanup(&vdebayer->sd); media_entity_cleanup(vdebayer->ved.ent); kfree(vdebayer); } static const struct v4l2_ctrl_config vimc_debayer_ctrl_class = { .flags = V4L2_CTRL_FLAG_READ_ONLY | V4L2_CTRL_FLAG_WRITE_ONLY, .id = VIMC_CID_VIMC_CLASS, .name = "VIMC Controls", .type = V4L2_CTRL_TYPE_CTRL_CLASS, }; static const struct v4l2_ctrl_config vimc_debayer_ctrl_mean_win_size = { .ops = &vimc_debayer_ctrl_ops, .id = VIMC_CID_MEAN_WIN_SIZE, .name = "Debayer Mean Window Size", .type = V4L2_CTRL_TYPE_INTEGER, .min = 1, .max = 25, .step = 2, .def = 3, }; static struct vimc_ent_device *vimc_debayer_add(struct vimc_device *vimc, const char *vcfg_name) { struct v4l2_device *v4l2_dev = &vimc->v4l2_dev; struct vimc_debayer_device *vdebayer; int ret; /* Allocate the vdebayer struct */ vdebayer = kzalloc(sizeof(*vdebayer), GFP_KERNEL); if (!vdebayer) return ERR_PTR(-ENOMEM); /* Create controls: */ v4l2_ctrl_handler_init(&vdebayer->hdl, 2); v4l2_ctrl_new_custom(&vdebayer->hdl, &vimc_debayer_ctrl_class, NULL); v4l2_ctrl_new_custom(&vdebayer->hdl, &vimc_debayer_ctrl_mean_win_size, NULL); vdebayer->sd.ctrl_handler = &vdebayer->hdl; if (vdebayer->hdl.error) { ret = vdebayer->hdl.error; goto err_free_vdebayer; } /* Initialize ved and sd */ vdebayer->pads[0].flags = MEDIA_PAD_FL_SINK; vdebayer->pads[1].flags 
= MEDIA_PAD_FL_SOURCE; ret = vimc_ent_sd_register(&vdebayer->ved, &vdebayer->sd, v4l2_dev, vcfg_name, MEDIA_ENT_F_PROC_VIDEO_PIXEL_ENC_CONV, 2, vdebayer->pads, &vimc_debayer_internal_ops, &vimc_debayer_ops); if (ret) goto err_free_hdl; vdebayer->ved.process_frame = vimc_debayer_process_frame; vdebayer->ved.dev = vimc->mdev.dev; vdebayer->hw.mean_win_size = vimc_debayer_ctrl_mean_win_size.def; vdebayer->set_rgb_src = vimc_debayer_process_rgb_frame; return &vdebayer->ved; err_free_hdl: v4l2_ctrl_handler_free(&vdebayer->hdl); err_free_vdebayer: kfree(vdebayer); return ERR_PTR(ret); } const struct vimc_ent_type vimc_debayer_type = { .add = vimc_debayer_add, .release = vimc_debayer_release };
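The debayer above reconstructs each RGB pixel by averaging, per colour, all same-colour Bayer samples inside a mean window centred on the pixel and clipped at the frame borders. The standalone sketch below restates that windowed average outside the driver; the flat one-byte-per-sample buffer and the caller-supplied order[2][2] colour table are illustrative assumptions standing in for the driver's hw state, not part of vimc.

/*
 * Sketch only: average the same-colour samples in a window of side "win"
 * around (lin, col), clipped to the frame, as vimc_debayer_calc_rgb_sink()
 * does for each of the three colours.
 */
static unsigned int mean_window(const unsigned char *frame,
				unsigned int width, unsigned int height,
				unsigned int lin, unsigned int col,
				unsigned int win, int want_color,
				const int order[2][2])
{
	unsigned int seek = win / 2, sum = 0, n = 0, l, c;

	for (l = (lin < seek) ? 0 : lin - seek; l <= lin + seek && l < height; l++)
		for (c = (col < seek) ? 0 : col - seek; c <= col + seek && c < width; c++)
			if (order[l % 2][c % 2] == want_color) {
				sum += frame[l * width + c];
				n++;
			}

	return n ? sum / n : 0;
}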
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RATELIMIT_H
#define _LINUX_RATELIMIT_H

#include <linux/ratelimit_types.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

static inline void ratelimit_state_init(struct ratelimit_state *rs,
					int interval, int burst)
{
	memset(rs, 0, sizeof(*rs));

	raw_spin_lock_init(&rs->lock);
	rs->interval	= interval;
	rs->burst	= burst;
}

static inline void ratelimit_default_init(struct ratelimit_state *rs)
{
	return ratelimit_state_init(rs, DEFAULT_RATELIMIT_INTERVAL,
					DEFAULT_RATELIMIT_BURST);
}

static inline void ratelimit_state_inc_miss(struct ratelimit_state *rs)
{
	atomic_inc(&rs->missed);
}

static inline int ratelimit_state_get_miss(struct ratelimit_state *rs)
{
	return atomic_read(&rs->missed);
}

static inline int ratelimit_state_reset_miss(struct ratelimit_state *rs)
{
	return atomic_xchg_relaxed(&rs->missed, 0);
}

static inline void ratelimit_state_reset_interval(struct ratelimit_state *rs, int interval_init)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&rs->lock, flags);
	rs->interval = interval_init;
	rs->flags &= ~RATELIMIT_INITIALIZED;
	atomic_set(&rs->rs_n_left, rs->burst);
	ratelimit_state_reset_miss(rs);
	raw_spin_unlock_irqrestore(&rs->lock, flags);
}

static inline void ratelimit_state_exit(struct ratelimit_state *rs)
{
	int m;

	if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE))
		return;

	m = ratelimit_state_reset_miss(rs);
	if (m)
		pr_warn("%s: %d output lines suppressed due to ratelimiting\n",
			current->comm, m);
}

static inline void ratelimit_set_flags(struct ratelimit_state *rs,
				       unsigned long flags)
{
	rs->flags = flags;
}

extern struct ratelimit_state printk_ratelimit_state;

#ifdef CONFIG_PRINTK

#define WARN_ON_RATELIMIT(condition, state)	({	\
	bool __rtn_cond = !!(condition);		\
	WARN_ON(__rtn_cond && __ratelimit(state));	\
	__rtn_cond;					\
})

#define WARN_RATELIMIT(condition, format, ...)			\
({								\
	static DEFINE_RATELIMIT_STATE(_rs,			\
				      DEFAULT_RATELIMIT_INTERVAL,	\
				      DEFAULT_RATELIMIT_BURST);	\
	int rtn = !!(condition);				\
								\
	if (unlikely(rtn && __ratelimit(&_rs)))			\
		WARN(rtn, format, ##__VA_ARGS__);		\
								\
	rtn;							\
})

#else

#define WARN_ON_RATELIMIT(condition, state)			\
	WARN_ON(condition)

#define WARN_RATELIMIT(condition, format, ...)			\
({								\
	int rtn = WARN(condition, format, ##__VA_ARGS__);	\
	rtn;							\
})

#endif

#endif /* _LINUX_RATELIMIT_H */
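For context, the usual calling pattern for this header is sketched below: a state declared with DEFINE_RATELIMIT_STATE() (from ratelimit_types.h) gates an event via __ratelimit(), which returns non-zero while the burst budget for the current interval is not exhausted; suppressed calls accumulate in the missed counter read back by the helpers above. The device and message here are made up for illustration.

/* Illustrative only: allow at most 10 messages every 5 * HZ jiffies. */
static DEFINE_RATELIMIT_STATE(xfer_rs, 5 * HZ, 10);

static void report_xfer_error(struct device *dev, int err)
{
	if (__ratelimit(&xfer_rs))	/* suppressed calls are counted as "missed" */
		dev_err(dev, "transfer failed: %d\n", err);
}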
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/signalfd.h
 *
 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
 *
 */
#ifndef _LINUX_SIGNALFD_H
#define _LINUX_SIGNALFD_H

#include <uapi/linux/signalfd.h>
#include <linux/sched/signal.h>

#ifdef CONFIG_SIGNALFD

/*
 * Deliver the signal to listening signalfd.
 */
static inline void signalfd_notify(struct task_struct *tsk, int sig)
{
	if (unlikely(waitqueue_active(&tsk->sighand->signalfd_wqh)))
		wake_up(&tsk->sighand->signalfd_wqh);
}

extern void signalfd_cleanup(struct sighand_struct *sighand);

#else /* CONFIG_SIGNALFD */

static inline void signalfd_notify(struct task_struct *tsk, int sig) { }

static inline void signalfd_cleanup(struct sighand_struct *sighand) { }

#endif /* CONFIG_SIGNALFD */

#endif /* _LINUX_SIGNALFD_H */
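The header above is only the kernel half: signalfd_notify() wakes the per-sighand wait queue that a userspace signalfd(2) descriptor sleeps on. A rough standalone userspace counterpart, shown only to place that wakeup in context and not part of the header, could look like this:

#include <sys/signalfd.h>
#include <signal.h>
#include <unistd.h>

int wait_for_sigusr1(void)
{
	struct signalfd_siginfo si;
	sigset_t mask;
	int fd, signo = -1;

	sigemptyset(&mask);
	sigaddset(&mask, SIGUSR1);
	sigprocmask(SIG_BLOCK, &mask, NULL);	/* must be blocked to be read via the fd */

	fd = signalfd(-1, &mask, SFD_CLOEXEC);
	if (fd < 0)
		return -1;

	if (read(fd, &si, sizeof(si)) == sizeof(si))
		signo = si.ssi_signo;

	close(fd);
	return signo;
}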
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_GETORDER_H
#define __ASM_GENERIC_GETORDER_H

#ifndef __ASSEMBLY__

#include <linux/compiler.h>
#include <linux/log2.h>

/**
 * get_order - Determine the allocation order of a memory size
 * @size: The size for which to get the order
 *
 * Determine the allocation order of a particular sized block of memory.  This
 * is on a logarithmic scale, where:
 *
 *	0 -> 2^0 * PAGE_SIZE and below
 *	1 -> 2^1 * PAGE_SIZE to 2^0 * PAGE_SIZE + 1
 *	2 -> 2^2 * PAGE_SIZE to 2^1 * PAGE_SIZE + 1
 *	3 -> 2^3 * PAGE_SIZE to 2^2 * PAGE_SIZE + 1
 *	4 -> 2^4 * PAGE_SIZE to 2^3 * PAGE_SIZE + 1
 *	...
 *
 * The order returned is used to find the smallest allocation granule required
 * to hold an object of the specified size.
 *
 * The result is undefined if the size is 0.
 */
static __always_inline __attribute_const__ int get_order(unsigned long size)
{
	if (__builtin_constant_p(size)) {
		if (!size)
			return BITS_PER_LONG - PAGE_SHIFT;

		if (size < (1UL << PAGE_SHIFT))
			return 0;

		return ilog2((size) - 1) - PAGE_SHIFT + 1;
	}

	size--;
	size >>= PAGE_SHIFT;
#if BITS_PER_LONG == 32
	return fls(size);
#else
	return fls64(size);
#endif
}

#endif	/* __ASSEMBLY__ */

#endif	/* __ASM_GENERIC_GETORDER_H */
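To make the logarithmic table above concrete, here is a worked set of values under the assumption of 4 KiB pages (PAGE_SHIFT == 12); each result falls out of the "size--; size >>= PAGE_SHIFT; fls()" path, or equivalently of the ilog2() constant-folded branch.

/*
 * Worked example, assuming PAGE_SHIFT == 12 (4 KiB pages):
 *
 *	get_order(1)       == 0		anything up to one page
 *	get_order(4096)    == 0
 *	get_order(4097)    == 1		needs two pages -> 2^1 * PAGE_SIZE
 *	get_order(8192)    == 1
 *	get_order(8193)    == 2
 *	get_order(1 << 20) == 8		1 MiB == 256 pages == 2^8 pages
 */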
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2010 Werner Fink, Jiri Slaby
 */

#include <linux/console.h>
#include <linux/kernel.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/tty_driver.h>

/*
 * This is handler for /proc/consoles
 */
static int show_console_dev(struct seq_file *m, void *v)
{
	static const struct {
		short flag;
		char name;
	} con_flags[] = {
		{ CON_ENABLED,		'E' },
		{ CON_CONSDEV,		'C' },
		{ CON_BOOT,		'B' },
		{ CON_NBCON,		'N' },
		{ CON_PRINTBUFFER,	'p' },
		{ CON_BRL,		'b' },
		{ CON_ANYTIME,		'a' },
	};
	char flags[ARRAY_SIZE(con_flags) + 1];
	struct console *con = v;
	unsigned int a;
	dev_t dev = 0;

	if (con->device) {
		const struct tty_driver *driver;
		int index;

		/*
		 * Take console_lock to serialize device() callback with
		 * other console operations. For example, fg_console is
		 * modified under console_lock when switching vt.
		 */
		console_lock();
		driver = con->device(con, &index);
		console_unlock();

		if (driver) {
			dev = MKDEV(driver->major, driver->minor_start);
			dev += index;
		}
	}

	for (a = 0; a < ARRAY_SIZE(con_flags); a++)
		flags[a] = (con->flags & con_flags[a].flag) ?
			con_flags[a].name : ' ';
	flags[a] = 0;

	seq_setwidth(m, 21 - 1);
	seq_printf(m, "%s%d", con->name, con->index);
	seq_pad(m, ' ');
	seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-',
			((con->flags & CON_NBCON) || con->write) ? 'W' : '-',
			con->unblank ? 'U' : '-', flags);
	if (dev)
		seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));

	seq_putc(m, '\n');
	return 0;
}

static void *c_start(struct seq_file *m, loff_t *pos)
	__acquires(&console_mutex)
{
	struct console *con;
	loff_t off = 0;

	/*
	 * Hold the console_list_lock to guarantee safe traversal of the
	 * console list. SRCU cannot be used because there is no
	 * place to store the SRCU cookie.
	 */
	console_list_lock();
	for_each_console(con)
		if (off++ == *pos)
			break;

	return con;
}

static void *c_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct console *con = v;

	++*pos;
	return hlist_entry_safe(con->node.next, struct console, node);
}

static void c_stop(struct seq_file *m, void *v)
	__releases(&console_mutex)
{
	console_list_unlock();
}

static const struct seq_operations consoles_op = {
	.start	= c_start,
	.next	= c_next,
	.stop	= c_stop,
	.show	= show_console_dev
};

static int __init proc_consoles_init(void)
{
	proc_create_seq("consoles", 0, NULL, &consoles_op);
	return 0;
}
fs_initcall(proc_consoles_init);
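Put together, the seq_printf() calls above emit one line per registered console: the name and index padded to 20 columns, the R/W/U capability triplet, one column per flag from con_flags[], and, when a device() callback exists, the character device number. Purely as an illustration of that layout (not captured output), a VT console line would look roughly like:

/* tty0                 -WU (EC  p  )    4:1	<- illustrative layout only */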
// SPDX-License-Identifier: MIT
/*
 * Copyright 2018 Noralf Trønnes
 * Copyright (c) 2006-2009 Red Hat Inc.
 * Copyright (c) 2006-2008 Intel Corporation
 *   Jesse Barnes <jesse.barnes@intel.com>
 * Copyright (c) 2007 Dave Airlie <airlied@linux.ie>
 */

#include "drm/drm_modeset_lock.h"
#include <linux/export.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/string_helpers.h>

#include <drm/drm_atomic.h>
#include <drm/drm_client.h>
#include <drm/drm_connector.h>
#include <drm/drm_crtc.h>
#include <drm/drm_device.h>
#include <drm/drm_drv.h>
#include <drm/drm_edid.h>
#include <drm/drm_encoder.h>
#include <drm/drm_print.h>

#include "drm_crtc_internal.h"
#include "drm_internal.h"

#define DRM_CLIENT_MAX_CLONED_CONNECTORS	8

struct drm_client_offset {
	int x, y;
};

int drm_client_modeset_create(struct drm_client_dev *client)
{
	struct drm_device *dev = client->dev;
	unsigned int num_crtc = dev->mode_config.num_crtc;
	unsigned int max_connector_count = 1;
	struct drm_mode_set *modeset;
	struct drm_crtc *crtc;
	int i = 0;

	/* Add terminating zero entry to enable index less iteration */
	client->modesets = kcalloc(num_crtc + 1, sizeof(*client->modesets), GFP_KERNEL);
	if (!client->modesets)
		return -ENOMEM;

	mutex_init(&client->modeset_mutex);

	drm_for_each_crtc(crtc, dev)
		client->modesets[i++].crtc = crtc;

	/* Cloning is only supported in the single crtc case.
*/ if (num_crtc == 1) max_connector_count = DRM_CLIENT_MAX_CLONED_CONNECTORS; for (modeset = client->modesets; modeset->crtc; modeset++) { modeset->connectors = kcalloc(max_connector_count, sizeof(*modeset->connectors), GFP_KERNEL); if (!modeset->connectors) goto err_free; } return 0; err_free: drm_client_modeset_free(client); return -ENOMEM; } static void drm_client_modeset_release(struct drm_client_dev *client) { struct drm_mode_set *modeset; drm_client_for_each_modeset(modeset, client) { int i; drm_mode_destroy(client->dev, modeset->mode); modeset->mode = NULL; modeset->fb = NULL; for (i = 0; i < modeset->num_connectors; i++) { drm_connector_put(modeset->connectors[i]); modeset->connectors[i] = NULL; } modeset->num_connectors = 0; } } void drm_client_modeset_free(struct drm_client_dev *client) { struct drm_mode_set *modeset; mutex_lock(&client->modeset_mutex); drm_client_modeset_release(client); drm_client_for_each_modeset(modeset, client) kfree(modeset->connectors); mutex_unlock(&client->modeset_mutex); mutex_destroy(&client->modeset_mutex); kfree(client->modesets); } static struct drm_mode_set * drm_client_find_modeset(struct drm_client_dev *client, struct drm_crtc *crtc) { struct drm_mode_set *modeset; drm_client_for_each_modeset(modeset, client) if (modeset->crtc == crtc) return modeset; return NULL; } static const struct drm_display_mode * drm_connector_get_tiled_mode(struct drm_connector *connector) { const struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay == connector->tile_h_size && mode->vdisplay == connector->tile_v_size) return mode; } return NULL; } static const struct drm_display_mode * drm_connector_fallback_non_tiled_mode(struct drm_connector *connector) { const struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay == connector->tile_h_size && mode->vdisplay == connector->tile_v_size) continue; return mode; } return NULL; } static const struct drm_display_mode * drm_connector_preferred_mode(struct drm_connector *connector, int width, int height) { const struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay > width || mode->vdisplay > height) continue; if (mode->type & DRM_MODE_TYPE_PREFERRED) return mode; } return NULL; } static const struct drm_display_mode * drm_connector_first_mode(struct drm_connector *connector) { return list_first_entry_or_null(&connector->modes, struct drm_display_mode, head); } static const struct drm_display_mode * drm_connector_pick_cmdline_mode(struct drm_connector *connector) { const struct drm_cmdline_mode *cmdline_mode; const struct drm_display_mode *mode; bool prefer_non_interlace; /* * Find a user-defined mode. If the user gave us a valid * mode on the kernel command line, it will show up in this * list. */ list_for_each_entry(mode, &connector->modes, head) { if (mode->type & DRM_MODE_TYPE_USERDEF) return mode; } cmdline_mode = &connector->cmdline_mode; if (cmdline_mode->specified == false) return NULL; /* * Attempt to find a matching mode in the list of modes we * have gotten so far. 
*/ prefer_non_interlace = !cmdline_mode->interlace; again: list_for_each_entry(mode, &connector->modes, head) { /* check width/height */ if (mode->hdisplay != cmdline_mode->xres || mode->vdisplay != cmdline_mode->yres) continue; if (cmdline_mode->refresh_specified) { if (drm_mode_vrefresh(mode) != cmdline_mode->refresh) continue; } if (cmdline_mode->interlace) { if (!(mode->flags & DRM_MODE_FLAG_INTERLACE)) continue; } else if (prefer_non_interlace) { if (mode->flags & DRM_MODE_FLAG_INTERLACE) continue; } return mode; } if (prefer_non_interlace) { prefer_non_interlace = false; goto again; } return NULL; } static bool drm_connector_enabled(struct drm_connector *connector, bool strict) { bool enable; if (connector->display_info.non_desktop) return false; if (strict) enable = connector->status == connector_status_connected; else enable = connector->status != connector_status_disconnected; return enable; } static void drm_client_connectors_enabled(struct drm_connector *connectors[], unsigned int connector_count, bool enabled[]) { bool any_enabled = false; struct drm_connector *connector; int i = 0; for (i = 0; i < connector_count; i++) { connector = connectors[i]; enabled[i] = drm_connector_enabled(connector, true); drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] enabled? %s\n", connector->base.id, connector->name, connector->display_info.non_desktop ? "non desktop" : str_yes_no(enabled[i])); any_enabled |= enabled[i]; } if (any_enabled) return; for (i = 0; i < connector_count; i++) enabled[i] = drm_connector_enabled(connectors[i], false); } static void mode_replace(struct drm_device *dev, const struct drm_display_mode **dst, const struct drm_display_mode *src) { drm_mode_destroy(dev, (struct drm_display_mode *)*dst); *dst = src ? drm_mode_duplicate(dev, src) : NULL; } static void modes_destroy(struct drm_device *dev, const struct drm_display_mode *modes[], int count) { int i; for (i = 0; i < count; i++) mode_replace(dev, &modes[i], NULL); } static bool drm_client_target_cloned(struct drm_device *dev, struct drm_connector *connectors[], unsigned int connector_count, const struct drm_display_mode *modes[], struct drm_client_offset offsets[], bool enabled[], int width, int height) { int count, i; bool can_clone = false; struct drm_display_mode *dmt_mode; /* only contemplate cloning in the single crtc case */ if (dev->mode_config.num_crtc > 1) return false; count = 0; for (i = 0; i < connector_count; i++) { if (enabled[i]) count++; } /* only contemplate cloning if more than one connector is enabled */ if (count <= 1) return false; /* check the command line or if nothing common pick 1024x768 */ can_clone = true; for (i = 0; i < connector_count; i++) { int j; if (!enabled[i]) continue; mode_replace(dev, &modes[i], drm_connector_pick_cmdline_mode(connectors[i])); if (!modes[i]) { can_clone = false; break; } for (j = 0; j < i; j++) { if (!enabled[j]) continue; if (!drm_mode_match(modes[j], modes[i], DRM_MODE_MATCH_TIMINGS | DRM_MODE_MATCH_CLOCK | DRM_MODE_MATCH_FLAGS | DRM_MODE_MATCH_3D_FLAGS)) can_clone = false; } } if (can_clone) { drm_dbg_kms(dev, "can clone using command line\n"); return true; } /* try and find a 1024x768 mode on each connector */ can_clone = true; dmt_mode = drm_mode_find_dmt(dev, 1024, 768, 60, false); if (!dmt_mode) goto fail; for (i = 0; i < connector_count; i++) { const struct drm_display_mode *mode; if (!enabled[i]) continue; list_for_each_entry(mode, &connectors[i]->modes, head) { if (drm_mode_match(mode, dmt_mode, DRM_MODE_MATCH_TIMINGS | DRM_MODE_MATCH_CLOCK | 
DRM_MODE_MATCH_FLAGS | DRM_MODE_MATCH_3D_FLAGS)) mode_replace(dev, &modes[i], mode); } if (!modes[i]) can_clone = false; } drm_mode_destroy(dev, dmt_mode); if (can_clone) { drm_dbg_kms(dev, "can clone using 1024x768\n"); return true; } fail: drm_info(dev, "kms: can't enable cloning when we probably wanted to.\n"); return false; } static int drm_client_get_tile_offsets(struct drm_device *dev, struct drm_connector *connectors[], unsigned int connector_count, const struct drm_display_mode *modes[], struct drm_client_offset offsets[], int idx, int h_idx, int v_idx) { int i; int hoffset = 0, voffset = 0; for (i = 0; i < connector_count; i++) { struct drm_connector *connector = connectors[i]; if (!connector->has_tile) continue; if (!modes[i] && (h_idx || v_idx)) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] no modes for connector tiled %d\n", connector->base.id, connector->name, i); continue; } if (connector->tile_h_loc < h_idx) hoffset += modes[i]->hdisplay; if (connector->tile_v_loc < v_idx) voffset += modes[i]->vdisplay; } offsets[idx].x = hoffset; offsets[idx].y = voffset; drm_dbg_kms(dev, "returned %d %d for %d %d\n", hoffset, voffset, h_idx, v_idx); return 0; } static bool drm_client_target_preferred(struct drm_device *dev, struct drm_connector *connectors[], unsigned int connector_count, const struct drm_display_mode *modes[], struct drm_client_offset offsets[], bool enabled[], int width, int height) { const u64 mask = BIT_ULL(connector_count) - 1; u64 conn_configured = 0; int tile_pass = 0; int num_tiled_conns = 0; int i; for (i = 0; i < connector_count; i++) { if (connectors[i]->has_tile && connectors[i]->status == connector_status_connected) num_tiled_conns++; } retry: for (i = 0; i < connector_count; i++) { struct drm_connector *connector = connectors[i]; const char *mode_type; if (conn_configured & BIT_ULL(i)) continue; if (enabled[i] == false) { conn_configured |= BIT_ULL(i); continue; } /* first pass over all the untiled connectors */ if (tile_pass == 0 && connector->has_tile) continue; if (tile_pass == 1) { if (connector->tile_h_loc != 0 || connector->tile_v_loc != 0) continue; } else { if (connector->tile_h_loc != tile_pass - 1 && connector->tile_v_loc != tile_pass - 1) /* if this tile_pass doesn't cover any of the tiles - keep going */ continue; /* * find the tile offsets for this pass - need to find * all tiles left and above */ drm_client_get_tile_offsets(dev, connectors, connector_count, modes, offsets, i, connector->tile_h_loc, connector->tile_v_loc); } mode_type = "cmdline"; mode_replace(dev, &modes[i], drm_connector_pick_cmdline_mode(connector)); if (!modes[i]) { mode_type = "preferred"; mode_replace(dev, &modes[i], drm_connector_preferred_mode(connector, width, height)); } if (!modes[i]) { mode_type = "first"; mode_replace(dev, &modes[i], drm_connector_first_mode(connector)); } /* * In case of tiled mode if all tiles not present fallback to * first available non tiled mode. * After all tiles are present, try to find the tiled mode * for all and if tiled mode not present due to fbcon size * limitations, use first non tiled mode only for * tile 0,0 and set to no mode for all other tiles. 
*/ if (connector->has_tile) { if (num_tiled_conns < connector->num_h_tile * connector->num_v_tile || (connector->tile_h_loc == 0 && connector->tile_v_loc == 0 && !drm_connector_get_tiled_mode(connector))) { mode_type = "non tiled"; mode_replace(dev, &modes[i], drm_connector_fallback_non_tiled_mode(connector)); } else { mode_type = "tiled"; mode_replace(dev, &modes[i], drm_connector_get_tiled_mode(connector)); } } if (modes[i]) drm_dbg_kms(dev, "[CONNECTOR:%d:%s] found %s mode: %s\n", connector->base.id, connector->name, mode_type, modes[i]->name); else drm_dbg_kms(dev, "[CONNECTOR:%d:%s] no mode found\n", connector->base.id, connector->name); conn_configured |= BIT_ULL(i); } if ((conn_configured & mask) != mask) { tile_pass++; goto retry; } return true; } static bool connector_has_possible_crtc(struct drm_connector *connector, struct drm_crtc *crtc) { struct drm_encoder *encoder; drm_connector_for_each_possible_encoder(connector, encoder) { if (encoder->possible_crtcs & drm_crtc_mask(crtc)) return true; } return false; } static int drm_client_pick_crtcs(struct drm_client_dev *client, struct drm_connector *connectors[], unsigned int connector_count, struct drm_crtc *best_crtcs[], const struct drm_display_mode *modes[], int n, int width, int height) { struct drm_device *dev = client->dev; struct drm_connector *connector; int my_score, best_score, score; struct drm_crtc **crtcs; struct drm_mode_set *modeset; if (n == connector_count) return 0; connector = connectors[n]; best_crtcs[n] = NULL; best_score = drm_client_pick_crtcs(client, connectors, connector_count, best_crtcs, modes, n + 1, width, height); if (modes[n] == NULL) return best_score; crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL); if (!crtcs) return best_score; my_score = 1; if (connector->status == connector_status_connected) my_score++; if (connector->cmdline_mode.specified) my_score++; if (drm_connector_preferred_mode(connector, width, height)) my_score++; /* * select a crtc for this connector and then attempt to configure * remaining connectors */ drm_client_for_each_modeset(modeset, client) { struct drm_crtc *crtc = modeset->crtc; int o; if (!connector_has_possible_crtc(connector, crtc)) continue; for (o = 0; o < n; o++) if (best_crtcs[o] == crtc) break; if (o < n) { /* ignore cloning unless only a single crtc */ if (dev->mode_config.num_crtc > 1) continue; if (!drm_mode_equal(modes[o], modes[n])) continue; } crtcs[n] = crtc; memcpy(crtcs, best_crtcs, n * sizeof(*crtcs)); score = my_score + drm_client_pick_crtcs(client, connectors, connector_count, crtcs, modes, n + 1, width, height); if (score > best_score) { best_score = score; memcpy(best_crtcs, crtcs, connector_count * sizeof(*crtcs)); } } kfree(crtcs); return best_score; } /* Try to read the BIOS display configuration and use it for the initial config */ static bool drm_client_firmware_config(struct drm_client_dev *client, struct drm_connector *connectors[], unsigned int connector_count, struct drm_crtc *crtcs[], const struct drm_display_mode *modes[], struct drm_client_offset offsets[], bool enabled[], int width, int height) { const int count = min_t(unsigned int, connector_count, BITS_PER_LONG); unsigned long conn_configured, conn_seq, mask; struct drm_device *dev = client->dev; int i; bool *save_enabled; bool fallback = true, ret = true; int num_connectors_enabled = 0; int num_connectors_detected = 0; int num_tiled_conns = 0; struct drm_modeset_acquire_ctx ctx; if (!drm_drv_uses_atomic_modeset(dev)) return false; if (drm_WARN_ON(dev, count <= 0)) return 
false; save_enabled = kcalloc(count, sizeof(bool), GFP_KERNEL); if (!save_enabled) return false; drm_modeset_acquire_init(&ctx, 0); while (drm_modeset_lock_all_ctx(dev, &ctx) != 0) drm_modeset_backoff(&ctx); memcpy(save_enabled, enabled, count); mask = GENMASK(count - 1, 0); conn_configured = 0; for (i = 0; i < count; i++) { if (connectors[i]->has_tile && connectors[i]->status == connector_status_connected) num_tiled_conns++; } retry: conn_seq = conn_configured; for (i = 0; i < count; i++) { struct drm_connector *connector = connectors[i]; struct drm_encoder *encoder; struct drm_crtc *crtc; const char *mode_type; int j; if (conn_configured & BIT(i)) continue; if (conn_seq == 0 && !connector->has_tile) continue; if (connector->status == connector_status_connected) num_connectors_detected++; if (!enabled[i]) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] not enabled, skipping\n", connector->base.id, connector->name); conn_configured |= BIT(i); continue; } if (connector->force == DRM_FORCE_OFF) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] disabled by user, skipping\n", connector->base.id, connector->name); enabled[i] = false; continue; } encoder = connector->state->best_encoder; if (!encoder || drm_WARN_ON(dev, !connector->state->crtc)) { if (connector->force > DRM_FORCE_OFF) goto bail; drm_dbg_kms(dev, "[CONNECTOR:%d:%s] has no encoder or crtc, skipping\n", connector->base.id, connector->name); enabled[i] = false; conn_configured |= BIT(i); continue; } num_connectors_enabled++; crtc = connector->state->crtc; /* * Make sure we're not trying to drive multiple connectors * with a single CRTC, since our cloning support may not * match the BIOS. */ for (j = 0; j < count; j++) { if (crtcs[j] == crtc) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] fallback: cloned configuration\n", connector->base.id, connector->name); goto bail; } } mode_type = "cmdline"; mode_replace(dev, &modes[i], drm_connector_pick_cmdline_mode(connector)); if (!modes[i]) { mode_type = "preferred"; mode_replace(dev, &modes[i], drm_connector_preferred_mode(connector, width, height)); } if (!modes[i]) { mode_type = "first"; mode_replace(dev, &modes[i], drm_connector_first_mode(connector)); } /* last resort: use current mode */ if (!modes[i]) { mode_type = "current"; mode_replace(dev, &modes[i], &crtc->state->mode); } /* * In case of tiled modes, if all tiles are not present * then fallback to a non tiled mode. */ if (connector->has_tile && num_tiled_conns < connector->num_h_tile * connector->num_v_tile) { mode_type = "non tiled"; mode_replace(dev, &modes[i], drm_connector_fallback_non_tiled_mode(connector)); } crtcs[i] = crtc; drm_dbg_kms(dev, "[CONNECTOR::%d:%s] on [CRTC:%d:%s] using %s mode: %s\n", connector->base.id, connector->name, crtc->base.id, crtc->name, mode_type, modes[i]->name); fallback = false; conn_configured |= BIT(i); } if ((conn_configured & mask) != mask && conn_configured != conn_seq) goto retry; for (i = 0; i < count; i++) { struct drm_connector *connector = connectors[i]; if (connector->has_tile) drm_client_get_tile_offsets(dev, connectors, connector_count, modes, offsets, i, connector->tile_h_loc, connector->tile_v_loc); } /* * If the BIOS didn't enable everything it could, fall back to have the * same user experiencing of lighting up as much as possible like the * fbdev helper library. 
*/ if (num_connectors_enabled != num_connectors_detected && num_connectors_enabled < dev->mode_config.num_crtc) { drm_dbg_kms(dev, "fallback: Not all outputs enabled\n"); drm_dbg_kms(dev, "Enabled: %i, detected: %i\n", num_connectors_enabled, num_connectors_detected); fallback = true; } if (fallback) { bail: drm_dbg_kms(dev, "Not using firmware configuration\n"); memcpy(enabled, save_enabled, count); ret = false; } drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); kfree(save_enabled); return ret; } /** * drm_client_modeset_probe() - Probe for displays * @client: DRM client * @width: Maximum display mode width (optional) * @height: Maximum display mode height (optional) * * This function sets up display pipelines for enabled connectors and stores the * config in the client's modeset array. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, unsigned int height) { struct drm_connector *connector, **connectors = NULL; struct drm_connector_list_iter conn_iter; struct drm_device *dev = client->dev; unsigned int total_modes_count = 0; struct drm_client_offset *offsets; unsigned int connector_count = 0; const struct drm_display_mode **modes; struct drm_crtc **crtcs; int i, ret = 0; bool *enabled; drm_dbg_kms(dev, "\n"); if (!width) width = dev->mode_config.max_width; if (!height) height = dev->mode_config.max_height; drm_connector_list_iter_begin(dev, &conn_iter); drm_client_for_each_connector_iter(connector, &conn_iter) { struct drm_connector **tmp; tmp = krealloc(connectors, (connector_count + 1) * sizeof(*connectors), GFP_KERNEL); if (!tmp) { ret = -ENOMEM; goto free_connectors; } connectors = tmp; drm_connector_get(connector); connectors[connector_count++] = connector; } drm_connector_list_iter_end(&conn_iter); if (!connector_count) return 0; crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL); modes = kcalloc(connector_count, sizeof(*modes), GFP_KERNEL); offsets = kcalloc(connector_count, sizeof(*offsets), GFP_KERNEL); enabled = kcalloc(connector_count, sizeof(bool), GFP_KERNEL); if (!crtcs || !modes || !enabled || !offsets) { ret = -ENOMEM; goto out; } mutex_lock(&client->modeset_mutex); mutex_lock(&dev->mode_config.mutex); for (i = 0; i < connector_count; i++) total_modes_count += connectors[i]->funcs->fill_modes(connectors[i], width, height); if (!total_modes_count) drm_dbg_kms(dev, "No connectors reported connected with modes\n"); drm_client_connectors_enabled(connectors, connector_count, enabled); if (!drm_client_firmware_config(client, connectors, connector_count, crtcs, modes, offsets, enabled, width, height)) { modes_destroy(dev, modes, connector_count); memset(crtcs, 0, connector_count * sizeof(*crtcs)); memset(offsets, 0, connector_count * sizeof(*offsets)); if (!drm_client_target_cloned(dev, connectors, connector_count, modes, offsets, enabled, width, height) && !drm_client_target_preferred(dev, connectors, connector_count, modes, offsets, enabled, width, height)) drm_err(dev, "Unable to find initial modes\n"); drm_dbg_kms(dev, "picking CRTCs for %dx%d config\n", width, height); drm_client_pick_crtcs(client, connectors, connector_count, crtcs, modes, 0, width, height); } mutex_unlock(&dev->mode_config.mutex); drm_client_modeset_release(client); for (i = 0; i < connector_count; i++) { const struct drm_display_mode *mode = modes[i]; struct drm_crtc *crtc = crtcs[i]; struct drm_client_offset *offset = &offsets[i]; if (mode && crtc) { struct drm_mode_set *modeset = 
drm_client_find_modeset(client, crtc); struct drm_connector *connector = connectors[i]; drm_dbg_kms(dev, "[CRTC:%d:%s] desired mode %s set (%d,%d)\n", crtc->base.id, crtc->name, mode->name, offset->x, offset->y); if (drm_WARN_ON_ONCE(dev, modeset->num_connectors == DRM_CLIENT_MAX_CLONED_CONNECTORS || (dev->mode_config.num_crtc > 1 && modeset->num_connectors == 1))) { ret = -EINVAL; break; } drm_mode_destroy(dev, modeset->mode); modeset->mode = drm_mode_duplicate(dev, mode); if (!modeset->mode) { ret = -ENOMEM; break; } drm_connector_get(connector); modeset->connectors[modeset->num_connectors++] = connector; modeset->x = offset->x; modeset->y = offset->y; } } mutex_unlock(&client->modeset_mutex); out: kfree(crtcs); modes_destroy(dev, modes, connector_count); kfree(modes); kfree(offsets); kfree(enabled); free_connectors: for (i = 0; i < connector_count; i++) drm_connector_put(connectors[i]); kfree(connectors); return ret; } EXPORT_SYMBOL(drm_client_modeset_probe); /** * drm_client_rotation() - Check the initial rotation value * @modeset: DRM modeset * @rotation: Returned rotation value * * This function checks if the primary plane in @modeset can hw rotate * to match the rotation needed on its connector. * * Note: Currently only 0 and 180 degrees are supported. * * Return: * True if the plane can do the rotation, false otherwise. */ bool drm_client_rotation(struct drm_mode_set *modeset, unsigned int *rotation) { struct drm_connector *connector = modeset->connectors[0]; struct drm_plane *plane = modeset->crtc->primary; struct drm_cmdline_mode *cmdline; u64 valid_mask = 0; int i; if (!modeset->num_connectors) return false; switch (connector->display_info.panel_orientation) { case DRM_MODE_PANEL_ORIENTATION_BOTTOM_UP: *rotation = DRM_MODE_ROTATE_180; break; case DRM_MODE_PANEL_ORIENTATION_LEFT_UP: *rotation = DRM_MODE_ROTATE_90; break; case DRM_MODE_PANEL_ORIENTATION_RIGHT_UP: *rotation = DRM_MODE_ROTATE_270; break; default: *rotation = DRM_MODE_ROTATE_0; } /** * The panel already defined the default rotation * through its orientation. Whatever has been provided * on the command line needs to be added to that. * * Unfortunately, the rotations are at different bit * indices, so the math to add them up are not as * trivial as they could. * * Reflections on the other hand are pretty trivial to deal with, a * simple XOR between the two handle the addition nicely. */ cmdline = &connector->cmdline_mode; if (cmdline->specified && cmdline->rotation_reflection) { unsigned int cmdline_rest, panel_rest; unsigned int cmdline_rot, panel_rot; unsigned int sum_rot, sum_rest; panel_rot = ilog2(*rotation & DRM_MODE_ROTATE_MASK); cmdline_rot = ilog2(cmdline->rotation_reflection & DRM_MODE_ROTATE_MASK); sum_rot = (panel_rot + cmdline_rot) % 4; panel_rest = *rotation & ~DRM_MODE_ROTATE_MASK; cmdline_rest = cmdline->rotation_reflection & ~DRM_MODE_ROTATE_MASK; sum_rest = panel_rest ^ cmdline_rest; *rotation = (1 << sum_rot) | sum_rest; } /* * TODO: support 90 / 270 degree hardware rotation, * depending on the hardware this may require the framebuffer * to be in a specific tiling format. 
*/ if (((*rotation & DRM_MODE_ROTATE_MASK) != DRM_MODE_ROTATE_0 && (*rotation & DRM_MODE_ROTATE_MASK) != DRM_MODE_ROTATE_180) || !plane->rotation_property) return false; for (i = 0; i < plane->rotation_property->num_values; i++) valid_mask |= (1ULL << plane->rotation_property->values[i]); if (!(*rotation & valid_mask)) return false; return true; } EXPORT_SYMBOL(drm_client_rotation); static int drm_client_modeset_commit_atomic(struct drm_client_dev *client, bool active, bool check) { struct drm_device *dev = client->dev; struct drm_plane *plane; struct drm_atomic_state *state; struct drm_modeset_acquire_ctx ctx; struct drm_mode_set *mode_set; int ret; drm_modeset_acquire_init(&ctx, 0); state = drm_atomic_state_alloc(dev); if (!state) { ret = -ENOMEM; goto out_ctx; } state->acquire_ctx = &ctx; retry: drm_for_each_plane(plane, dev) { struct drm_plane_state *plane_state; plane_state = drm_atomic_get_plane_state(state, plane); if (IS_ERR(plane_state)) { ret = PTR_ERR(plane_state); goto out_state; } plane_state->rotation = DRM_MODE_ROTATE_0; /* disable non-primary: */ if (plane->type == DRM_PLANE_TYPE_PRIMARY) continue; ret = __drm_atomic_helper_disable_plane(plane, plane_state); if (ret != 0) goto out_state; } drm_client_for_each_modeset(mode_set, client) { struct drm_plane *primary = mode_set->crtc->primary; unsigned int rotation; if (drm_client_rotation(mode_set, &rotation)) { struct drm_plane_state *plane_state; /* Cannot fail as we've already gotten the plane state above */ plane_state = drm_atomic_get_new_plane_state(state, primary); plane_state->rotation = rotation; } ret = __drm_atomic_helper_set_config(mode_set, state); if (ret != 0) goto out_state; /* * __drm_atomic_helper_set_config() sets active when a * mode is set, unconditionally clear it if we force DPMS off */ if (!active) { struct drm_crtc *crtc = mode_set->crtc; struct drm_crtc_state *crtc_state = drm_atomic_get_new_crtc_state(state, crtc); crtc_state->active = false; } } if (check) ret = drm_atomic_check_only(state); else ret = drm_atomic_commit(state); out_state: if (ret == -EDEADLK) goto backoff; drm_atomic_state_put(state); out_ctx: drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); return ret; backoff: drm_atomic_state_clear(state); drm_modeset_backoff(&ctx); goto retry; } static int drm_client_modeset_commit_legacy(struct drm_client_dev *client) { struct drm_device *dev = client->dev; struct drm_mode_set *mode_set; struct drm_plane *plane; int ret = 0; drm_modeset_lock_all(dev); drm_for_each_plane(plane, dev) { if (plane->type != DRM_PLANE_TYPE_PRIMARY) drm_plane_force_disable(plane); if (plane->rotation_property) drm_mode_plane_set_obj_prop(plane, plane->rotation_property, DRM_MODE_ROTATE_0); } drm_client_for_each_modeset(mode_set, client) { struct drm_crtc *crtc = mode_set->crtc; if (crtc->funcs->cursor_set2) { ret = crtc->funcs->cursor_set2(crtc, NULL, 0, 0, 0, 0, 0); if (ret) goto out; } else if (crtc->funcs->cursor_set) { ret = crtc->funcs->cursor_set(crtc, NULL, 0, 0, 0); if (ret) goto out; } ret = drm_mode_set_config_internal(mode_set); if (ret) goto out; } out: drm_modeset_unlock_all(dev); return ret; } /** * drm_client_modeset_check() - Check modeset configuration * @client: DRM client * * Check modeset configuration. * * Returns: * Zero on success or negative error code on failure. 
*/ int drm_client_modeset_check(struct drm_client_dev *client) { int ret; if (!drm_drv_uses_atomic_modeset(client->dev)) return 0; mutex_lock(&client->modeset_mutex); ret = drm_client_modeset_commit_atomic(client, true, true); mutex_unlock(&client->modeset_mutex); return ret; } EXPORT_SYMBOL(drm_client_modeset_check); /** * drm_client_modeset_commit_locked() - Force commit CRTC configuration * @client: DRM client * * Commit modeset configuration to crtcs without checking if there is a DRM * master. The assumption is that the caller already holds an internal DRM * master reference acquired with drm_master_internal_acquire(). * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_commit_locked(struct drm_client_dev *client) { struct drm_device *dev = client->dev; int ret; mutex_lock(&client->modeset_mutex); if (drm_drv_uses_atomic_modeset(dev)) ret = drm_client_modeset_commit_atomic(client, true, false); else ret = drm_client_modeset_commit_legacy(client); mutex_unlock(&client->modeset_mutex); return ret; } EXPORT_SYMBOL(drm_client_modeset_commit_locked); /** * drm_client_modeset_commit() - Commit CRTC configuration * @client: DRM client * * Commit modeset configuration to crtcs. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_commit(struct drm_client_dev *client) { struct drm_device *dev = client->dev; int ret; if (!drm_master_internal_acquire(dev)) return -EBUSY; ret = drm_client_modeset_commit_locked(client); drm_master_internal_release(dev); return ret; } EXPORT_SYMBOL(drm_client_modeset_commit); static void drm_client_modeset_dpms_legacy(struct drm_client_dev *client, int dpms_mode) { struct drm_device *dev = client->dev; struct drm_connector *connector; struct drm_mode_set *modeset; struct drm_modeset_acquire_ctx ctx; int ret; DRM_MODESET_LOCK_ALL_BEGIN(dev, ctx, 0, ret); drm_client_for_each_modeset(modeset, client) { int j; if (!modeset->crtc->enabled) continue; for (j = 0; j < modeset->num_connectors; j++) { connector = modeset->connectors[j]; connector->funcs->dpms(connector, dpms_mode); drm_object_property_set_value(&connector->base, dev->mode_config.dpms_property, dpms_mode); } } DRM_MODESET_LOCK_ALL_END(dev, ctx, ret); } /** * drm_client_modeset_dpms() - Set DPMS mode * @client: DRM client * @mode: DPMS mode * * Note: For atomic drivers @mode is reduced to on/off. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_dpms(struct drm_client_dev *client, int mode) { struct drm_device *dev = client->dev; int ret = 0; if (!drm_master_internal_acquire(dev)) return -EBUSY; mutex_lock(&client->modeset_mutex); if (drm_drv_uses_atomic_modeset(dev)) ret = drm_client_modeset_commit_atomic(client, mode == DRM_MODE_DPMS_ON, false); else drm_client_modeset_dpms_legacy(client, mode); mutex_unlock(&client->modeset_mutex); drm_master_internal_release(dev); return ret; } EXPORT_SYMBOL(drm_client_modeset_dpms); #ifdef CONFIG_DRM_KUNIT_TEST #include "tests/drm_client_modeset_test.c" #endif
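As a hedged sketch of how an in-kernel client consumes the helpers exported above (the function name and the assumption that drm_client_init() ran elsewhere are illustrative, not taken from the file): probe display pipelines for a target size, then commit the stored modesets.

/* Sketch only: restore a client-owned display configuration. */
static int restore_display(struct drm_client_dev *client,
			   unsigned int width, unsigned int height)
{
	int ret;

	/* Fill client->modesets for all connected outputs. */
	ret = drm_client_modeset_probe(client, width, height);
	if (ret)
		return ret;

	/* Program the CRTCs (atomic or legacy, decided internally). */
	return drm_client_modeset_commit(client);
}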
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	LAPB release 002
 *
 *	This code REQUIRES 2.1.15 or higher/ NET3.038
 *
 *	History
 *	LAPB 001	Jonathan Naylor	Started Coding
 *	LAPB 002	Jonathan Naylor	New timer architecture.
 *	2000-10-29	Henner Eisen	lapb_data_indication() return status.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/stat.h>
#include <linux/init.h>
#include <net/lapb.h>

static LIST_HEAD(lapb_list);
static DEFINE_RWLOCK(lapb_list_lock);

/*
 *	Free an allocated lapb control block.
 */
static void lapb_free_cb(struct lapb_cb *lapb)
{
	kfree(lapb);
}

static __inline__ void lapb_hold(struct lapb_cb *lapb)
{
	refcount_inc(&lapb->refcnt);
}

static __inline__ void lapb_put(struct lapb_cb *lapb)
{
	if (refcount_dec_and_test(&lapb->refcnt))
		lapb_free_cb(lapb);
}

/*
 *	Socket removal during an interrupt is now safe.
 */
static void __lapb_remove_cb(struct lapb_cb *lapb)
{
	if (lapb->node.next) {
		list_del(&lapb->node);
		lapb_put(lapb);
	}
}

/*
 *	Add a socket to the bound sockets list.
 */
static void __lapb_insert_cb(struct lapb_cb *lapb)
{
	list_add(&lapb->node, &lapb_list);
	lapb_hold(lapb);
}

static struct lapb_cb *__lapb_devtostruct(struct net_device *dev)
{
	struct lapb_cb *lapb, *use = NULL;

	list_for_each_entry(lapb, &lapb_list, node) {
		if (lapb->dev == dev) {
			use = lapb;
			break;
		}
	}

	if (use)
		lapb_hold(use);

	return use;
}

static struct lapb_cb *lapb_devtostruct(struct net_device *dev)
{
	struct lapb_cb *rc;

	read_lock_bh(&lapb_list_lock);
	rc = __lapb_devtostruct(dev);
	read_unlock_bh(&lapb_list_lock);

	return rc;
}

/*
 *	Create an empty LAPB control block.
 */
static struct lapb_cb *lapb_create_cb(void)
{
	struct lapb_cb *lapb = kzalloc(sizeof(*lapb), GFP_ATOMIC);

	if (!lapb)
		goto out;

	skb_queue_head_init(&lapb->write_queue);
	skb_queue_head_init(&lapb->ack_queue);

	timer_setup(&lapb->t1timer, NULL, 0);
	timer_setup(&lapb->t2timer, NULL, 0);
	lapb->t1timer_running = false;
	lapb->t2timer_running = false;

	lapb->t1 = LAPB_DEFAULT_T1;
	lapb->t2 = LAPB_DEFAULT_T2;
	lapb->n2 = LAPB_DEFAULT_N2;
	lapb->mode = LAPB_DEFAULT_MODE;
	lapb->window = LAPB_DEFAULT_WINDOW;
	lapb->state = LAPB_STATE_0;

	spin_lock_init(&lapb->lock);
	refcount_set(&lapb->refcnt, 1);
out:
	return lapb;
}

int lapb_register(struct net_device *dev,
		  const struct lapb_register_struct *callbacks)
{
	struct lapb_cb *lapb;
	int rc = LAPB_BADTOKEN;

	write_lock_bh(&lapb_list_lock);

	lapb = __lapb_devtostruct(dev);
	if (lapb) {
		lapb_put(lapb);
		goto out;
	}

	lapb = lapb_create_cb();
	rc = LAPB_NOMEM;
	if (!lapb)
		goto out;

	lapb->dev = dev;
	lapb->callbacks = callbacks;

	__lapb_insert_cb(lapb);

	lapb_start_t1timer(lapb);

	rc = LAPB_OK;
out:
	write_unlock_bh(&lapb_list_lock);
	return rc;
}
EXPORT_SYMBOL(lapb_register);

int lapb_unregister(struct net_device *dev)
{
	struct lapb_cb *lapb;
	int rc = LAPB_BADTOKEN;

	write_lock_bh(&lapb_list_lock);
	lapb = __lapb_devtostruct(dev);
	if (!lapb)
		goto out;
	lapb_put(lapb);

	/* Wait for other refs to "lapb" to drop */
	while (refcount_read(&lapb->refcnt) > 2)
		usleep_range(1, 10);

	spin_lock_bh(&lapb->lock);

	lapb_stop_t1timer(lapb);
	lapb_stop_t2timer(lapb);

	lapb_clear_queues(lapb);

	spin_unlock_bh(&lapb->lock);

	/* Wait for running timers to stop */
	timer_delete_sync(&lapb->t1timer);
	timer_delete_sync(&lapb->t2timer);

	__lapb_remove_cb(lapb);

	lapb_put(lapb);
	rc = LAPB_OK;
out:
	write_unlock_bh(&lapb_list_lock);
	return rc;
}
EXPORT_SYMBOL(lapb_unregister);

int lapb_getparms(struct net_device *dev, struct lapb_parms_struct *parms)
{
	int rc = LAPB_BADTOKEN;
	struct lapb_cb *lapb = lapb_devtostruct(dev);

	if (!lapb)
		goto out;

	spin_lock_bh(&lapb->lock);

	parms->t1 = lapb->t1 / HZ;
	parms->t2 = lapb->t2 / HZ;
	parms->n2 = lapb->n2;
	parms->n2count = lapb->n2count;
	parms->state = lapb->state;
	parms->window = lapb->window;
	parms->mode = lapb->mode;

	if (!timer_pending(&lapb->t1timer))
		parms->t1timer = 0;
	else
		parms->t1timer = (lapb->t1timer.expires - jiffies) / HZ;

	if (!timer_pending(&lapb->t2timer))
		parms->t2timer = 0;
	else
		parms->t2timer = (lapb->t2timer.expires - jiffies) / HZ;

	spin_unlock_bh(&lapb->lock);
	lapb_put(lapb);
	rc = LAPB_OK;
out:
	return rc;
}
EXPORT_SYMBOL(lapb_getparms);

int lapb_setparms(struct net_device *dev, struct lapb_parms_struct *parms)
{
	int rc = LAPB_BADTOKEN;
	struct lapb_cb *lapb = lapb_devtostruct(dev);

	if (!lapb)
		goto out;

	spin_lock_bh(&lapb->lock);

	rc = LAPB_INVALUE;
	if (parms->t1 < 1 || parms->t2 < 1 || parms->n2 < 1)
		goto out_put;

	if (lapb->state == LAPB_STATE_0) {
		if (parms->mode & LAPB_EXTENDED) {
			if (parms->window < 1 || parms->window > 127)
				goto out_put;
		} else {
			if (parms->window < 1 || parms->window > 7)
				goto out_put;
		}
		lapb->mode = parms->mode;
		lapb->window = parms->window;
	}

	lapb->t1 = parms->t1 * HZ;
	lapb->t2 = parms->t2 * HZ;
	lapb->n2 = parms->n2;

	rc = LAPB_OK;
out_put:
	spin_unlock_bh(&lapb->lock);
	lapb_put(lapb);
out:
	return rc;
}
EXPORT_SYMBOL(lapb_setparms);

int lapb_connect_request(struct net_device *dev)
{
	struct lapb_cb *lapb = lapb_devtostruct(dev);
	int rc = LAPB_BADTOKEN;

	if (!lapb)
		goto out;

	spin_lock_bh(&lapb->lock);

	rc = LAPB_OK;
	if (lapb->state == LAPB_STATE_1)
		goto out_put;

	rc = LAPB_CONNECTED;
	if (lapb->state == LAPB_STATE_3 || lapb->state == LAPB_STATE_4)
		goto out_put;

	lapb_establish_data_link(lapb);

	lapb_dbg(0, "(%p) S0 -> S1\n", lapb->dev);
	lapb->state = LAPB_STATE_1;

	rc = LAPB_OK;
out_put:
	spin_unlock_bh(&lapb->lock);
	lapb_put(lapb);
out:
	return rc;
}
EXPORT_SYMBOL(lapb_connect_request);

static int __lapb_disconnect_request(struct lapb_cb *lapb)
{
	switch (lapb->state) {
	case LAPB_STATE_0:
		return LAPB_NOTCONNECTED;

	case LAPB_STATE_1:
		lapb_dbg(1, "(%p) S1 TX DISC(1)\n", lapb->dev);
		lapb_dbg(0, "(%p) S1 -> S0\n", lapb->dev);
		lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND);
		lapb->state = LAPB_STATE_0;
		lapb_start_t1timer(lapb);
		return LAPB_NOTCONNECTED;

	case LAPB_STATE_2:
		return LAPB_OK;
	}

	lapb_clear_queues(lapb);
	lapb->n2count = 0;
	lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND);
	lapb_start_t1timer(lapb);
	lapb_stop_t2timer(lapb);
	lapb->state = LAPB_STATE_2;

	lapb_dbg(1, "(%p) S3 DISC(1)\n", lapb->dev);
	lapb_dbg(0, "(%p) S3 -> S2\n", lapb->dev);

	return LAPB_OK;
}

int lapb_disconnect_request(struct net_device *dev)
{
	struct lapb_cb *lapb = lapb_devtostruct(dev);
	int rc = LAPB_BADTOKEN;

	if (!lapb)
		goto out;

	spin_lock_bh(&lapb->lock);
	rc = __lapb_disconnect_request(lapb);
	spin_unlock_bh(&lapb->lock);

	lapb_put(lapb);
out:
	return rc;
}
EXPORT_SYMBOL(lapb_disconnect_request);

int lapb_data_request(struct net_device *dev, struct sk_buff *skb)
{
	struct lapb_cb *lapb = lapb_devtostruct(dev);
	int rc = LAPB_BADTOKEN;

	if (!lapb)
		goto out;

	spin_lock_bh(&lapb->lock);

	rc = LAPB_NOTCONNECTED;
	if (lapb->state != LAPB_STATE_3 && lapb->state != LAPB_STATE_4)
		goto out_put;

	skb_queue_tail(&lapb->write_queue, skb);
	lapb_kick(lapb);
	rc = LAPB_OK;
out_put:
	spin_unlock_bh(&lapb->lock);
	lapb_put(lapb);
out:
	return rc;
}
EXPORT_SYMBOL(lapb_data_request);

int lapb_data_received(struct net_device *dev, struct sk_buff *skb)
{
	struct lapb_cb *lapb = lapb_devtostruct(dev);
	int rc = LAPB_BADTOKEN;

	if (lapb) {
		spin_lock_bh(&lapb->lock);
		lapb_data_input(lapb, skb);
		spin_unlock_bh(&lapb->lock);
		lapb_put(lapb);
		rc = LAPB_OK;
	}

	return rc;
}
EXPORT_SYMBOL(lapb_data_received);

void lapb_connect_confirmation(struct lapb_cb *lapb, int reason)
{
	if (lapb->callbacks->connect_confirmation)
		lapb->callbacks->connect_confirmation(lapb->dev, reason);
}

void lapb_connect_indication(struct lapb_cb *lapb, int reason)
{
	if (lapb->callbacks->connect_indication)
		lapb->callbacks->connect_indication(lapb->dev, reason);
}

void lapb_disconnect_confirmation(struct lapb_cb *lapb, int reason)
{
	if (lapb->callbacks->disconnect_confirmation)
		lapb->callbacks->disconnect_confirmation(lapb->dev, reason);
}

void lapb_disconnect_indication(struct lapb_cb *lapb, int reason)
{
	if (lapb->callbacks->disconnect_indication)
		lapb->callbacks->disconnect_indication(lapb->dev, reason);
}

int lapb_data_indication(struct lapb_cb *lapb, struct sk_buff *skb)
{
	if (lapb->callbacks->data_indication)
		return lapb->callbacks->data_indication(lapb->dev, skb);

	kfree_skb(skb);
	return NET_RX_SUCCESS; /* For now; must be != NET_RX_DROP */
}

int lapb_data_transmit(struct lapb_cb *lapb, struct sk_buff *skb)
{
	int used = 0;

	if (lapb->callbacks->data_transmit) {
		lapb->callbacks->data_transmit(lapb->dev, skb);
		used = 1;
	}

	return used;
}

/* Handle device status changes. */
static int lapb_device_event(struct notifier_block *this, unsigned long event,
			     void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct lapb_cb *lapb;

	if (!net_eq(dev_net(dev), &init_net))
		return NOTIFY_DONE;

	if (dev->type != ARPHRD_X25)
		return NOTIFY_DONE;

	lapb = lapb_devtostruct(dev);
	if (!lapb)
		return NOTIFY_DONE;

	spin_lock_bh(&lapb->lock);

	switch (event) {
	case NETDEV_UP:
		lapb_dbg(0, "(%p) Interface up: %s\n", dev, dev->name);

		if (netif_carrier_ok(dev)) {
			lapb_dbg(0, "(%p): Carrier is already up: %s\n", dev,
				 dev->name);
			if (lapb->mode & LAPB_DCE) {
				lapb_start_t1timer(lapb);
			} else {
				if (lapb->state == LAPB_STATE_0) {
					lapb->state = LAPB_STATE_1;
					lapb_establish_data_link(lapb);
				}
			}
		}
		break;
	case NETDEV_GOING_DOWN:
		if (netif_carrier_ok(dev))
			__lapb_disconnect_request(lapb);
		break;
	case NETDEV_DOWN:
		lapb_dbg(0, "(%p) Interface down: %s\n", dev, dev->name);
		lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state);
		lapb_clear_queues(lapb);
		lapb->state = LAPB_STATE_0;
		lapb->n2count = 0;
		lapb_stop_t1timer(lapb);
		lapb_stop_t2timer(lapb);
		break;
	case NETDEV_CHANGE:
		if (netif_carrier_ok(dev)) {
			lapb_dbg(0, "(%p): Carrier detected: %s\n", dev,
				 dev->name);
			if (lapb->mode & LAPB_DCE) {
				lapb_start_t1timer(lapb);
			} else {
				if (lapb->state == LAPB_STATE_0) {
					lapb->state = LAPB_STATE_1;
					lapb_establish_data_link(lapb);
				}
			}
		} else {
			lapb_dbg(0, "(%p) Carrier lost: %s\n", dev, dev->name);
			lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state);
			lapb_clear_queues(lapb);
			lapb->state = LAPB_STATE_0;
			lapb->n2count = 0;
			lapb_stop_t1timer(lapb);
			lapb_stop_t2timer(lapb);
		}
		break;
	}

	spin_unlock_bh(&lapb->lock);
	lapb_put(lapb);
	return NOTIFY_DONE;
}

static struct notifier_block lapb_dev_notifier = {
	.notifier_call = lapb_device_event,
};

static int __init lapb_init(void)
{
	return register_netdevice_notifier(&lapb_dev_notifier);
}

static void __exit lapb_exit(void)
{
	WARN_ON(!list_empty(&lapb_list));

	unregister_netdevice_notifier(&lapb_dev_notifier);
}

MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>");
MODULE_DESCRIPTION("The X.25 Link Access Procedure B link layer protocol");
MODULE_LICENSE("GPL");

module_init(lapb_init);
module_exit(lapb_exit);
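A minimal sketch of how a device driver might hook into the interface exported above. Only the lapb_* entry points and the callback member names are taken from the code above; the mydrv_* helpers, the netif_rx()/dev_queue_xmit() hand-offs and their placement in the driver's open/close paths are illustrative assumptions, not part of this module.

#include <linux/lapb.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* All "mydrv_" names below are hypothetical. */

static void mydrv_connected(struct net_device *dev, int reason)
{
	netdev_info(dev, "LAPB link up (reason %d)\n", reason);
}

static void mydrv_disconnected(struct net_device *dev, int reason)
{
	netdev_info(dev, "LAPB link down (reason %d)\n", reason);
}

static int mydrv_data_indication(struct net_device *dev, struct sk_buff *skb)
{
	/* Reassembled I-frame payload from the peer; pass it up the stack.
	 * A real driver would typically set skb->protocol first.
	 */
	return netif_rx(skb);
}

static void mydrv_data_transmit(struct net_device *dev, struct sk_buff *skb)
{
	/* A finished LAPB frame to put on the wire; dev_queue_xmit() is just
	 * one possible hand-off, a real driver would use its own TX path.
	 */
	dev_queue_xmit(skb);
}

static const struct lapb_register_struct mydrv_lapb_callbacks = {
	.connect_confirmation    = mydrv_connected,
	.connect_indication      = mydrv_connected,
	.disconnect_confirmation = mydrv_disconnected,
	.disconnect_indication   = mydrv_disconnected,
	.data_indication         = mydrv_data_indication,
	.data_transmit           = mydrv_data_transmit,
};

static int mydrv_open(struct net_device *dev)
{
	/* Bind this netdev to LAPB and ask for link establishment. */
	if (lapb_register(dev, &mydrv_lapb_callbacks) != LAPB_OK)
		return -ENODEV;

	lapb_connect_request(dev);
	return 0;
}

static int mydrv_close(struct net_device *dev)
{
	lapb_disconnect_request(dev);
	return lapb_unregister(dev) == LAPB_OK ? 0 : -ENODEV;
}

In the same spirit, the driver's transmit routine would queue outgoing payloads with lapb_data_request(dev, skb) once the link is up, and frames arriving from the line would be fed into lapb_data_received(dev, skb).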
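The lapb_getparms()/lapb_setparms() pair shown above lets a driver (or an ioctl handler layered on top of it) retune the link. A short sketch under the same assumptions as the previous example; mydrv_tune_lapb() and the chosen values are illustrative only, while the lapb_parms_struct field names, the second-based timer units and the LAPB_STATE_0 restriction on mode/window come from the code above.

#include <linux/lapb.h>
#include <linux/netdevice.h>

/* Hypothetical helper: read-modify-write the LAPB link parameters. */
static int mydrv_tune_lapb(struct net_device *dev)
{
	struct lapb_parms_struct parms;
	int rc;

	rc = lapb_getparms(dev, &parms);
	if (rc != LAPB_OK)
		return rc;

	parms.t1 = 10;	/* T1 retransmission timer, in seconds */
	parms.n2 = 5;	/* maximum number of retries */

	/* mode and window are only accepted while the link is down
	 * (LAPB_STATE_0); extended mode allows a window of up to 127,
	 * standard mode only up to 7.
	 */
	if (parms.state == LAPB_STATE_0) {
		parms.mode |= LAPB_EXTENDED;
		parms.window = 63;
	}

	return lapb_setparms(dev, &parms);
}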
// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) B.A.T.M.A.N.
contributors: * * Linus Lüssing */ #include "multicast.h" #include "main.h" #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/icmpv6.h> #include <linux/if_bridge.h> #include <linux/if_ether.h> #include <linux/igmp.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/jiffies.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/sprintf.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/addrconf.h> #include <net/genetlink.h> #include <net/if_inet6.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "send.h" #include "translation-table.h" #include "tvlv.h" static void batadv_mcast_mla_update(struct work_struct *work); /** * batadv_mcast_start_timer() - schedule the multicast periodic worker * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_mcast_start_timer(struct batadv_priv *bat_priv) { queue_delayed_work(batadv_event_workqueue, &bat_priv->mcast.work, msecs_to_jiffies(BATADV_MCAST_WORK_PERIOD)); } /** * batadv_mcast_get_bridge() - get the bridge on top of the meshif if it exists * @mesh_iface: netdev struct of the mesh interface * * If the given mesh interface has a bridge on top then the refcount * of the according net device is increased. * * Return: NULL if no such bridge exists. Otherwise the net device of the * bridge. */ static struct net_device *batadv_mcast_get_bridge(struct net_device *mesh_iface) { struct net_device *upper = mesh_iface; rcu_read_lock(); do { upper = netdev_master_upper_dev_get_rcu(upper); } while (upper && !netif_is_bridge_master(upper)); dev_hold(upper); rcu_read_unlock(); return upper; } /** * batadv_mcast_mla_rtr_flags_meshif_get_ipv4() - get mcast router flags from * node for IPv4 * @dev: the interface to check * * Checks the presence of an IPv4 multicast router on this node. * * Caller needs to hold rcu read lock. * * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR4 otherwise. */ static u8 batadv_mcast_mla_rtr_flags_meshif_get_ipv4(struct net_device *dev) { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev && IN_DEV_MFORWARD(in_dev)) return BATADV_NO_FLAGS; else return BATADV_MCAST_WANT_NO_RTR4; } /** * batadv_mcast_mla_rtr_flags_meshif_get_ipv6() - get mcast router flags from * node for IPv6 * @dev: the interface to check * * Checks the presence of an IPv6 multicast router on this node. * * Caller needs to hold rcu read lock. * * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR6 otherwise. 
*/ #if IS_ENABLED(CONFIG_IPV6_MROUTE) static u8 batadv_mcast_mla_rtr_flags_meshif_get_ipv6(struct net_device *dev) { struct inet6_dev *in6_dev = __in6_dev_get(dev); if (in6_dev && atomic_read(&in6_dev->cnf.mc_forwarding)) return BATADV_NO_FLAGS; else return BATADV_MCAST_WANT_NO_RTR6; } #else static inline u8 batadv_mcast_mla_rtr_flags_meshif_get_ipv6(struct net_device *dev) { return BATADV_MCAST_WANT_NO_RTR6; } #endif /** * batadv_mcast_mla_rtr_flags_meshif_get() - get mcast router flags from node * @bat_priv: the bat priv with all the mesh interface information * @bridge: bridge interface on top of the mesh_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers on this * node. * * Return: * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ static u8 batadv_mcast_mla_rtr_flags_meshif_get(struct batadv_priv *bat_priv, struct net_device *bridge) { struct net_device *dev = bridge ? bridge : bat_priv->mesh_iface; u8 flags = BATADV_NO_FLAGS; rcu_read_lock(); flags |= batadv_mcast_mla_rtr_flags_meshif_get_ipv4(dev); flags |= batadv_mcast_mla_rtr_flags_meshif_get_ipv6(dev); rcu_read_unlock(); return flags; } /** * batadv_mcast_mla_rtr_flags_bridge_get() - get mcast router flags from bridge * @bat_priv: the bat priv with all the mesh interface information * @bridge: bridge interface on top of the mesh_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers behind a bridge. * * Return: * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, struct net_device *bridge) { struct net_device *dev = bat_priv->mesh_iface; u8 flags = BATADV_NO_FLAGS; if (!bridge) return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; if (!br_multicast_has_router_adjacent(dev, ETH_P_IP)) flags |= BATADV_MCAST_WANT_NO_RTR4; if (!br_multicast_has_router_adjacent(dev, ETH_P_IPV6)) flags |= BATADV_MCAST_WANT_NO_RTR6; return flags; } /** * batadv_mcast_mla_rtr_flags_get() - get multicast router flags * @bat_priv: the bat priv with all the mesh interface information * @bridge: bridge interface on top of the mesh_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers on this * node or behind its bridge. * * Return: * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ static u8 batadv_mcast_mla_rtr_flags_get(struct batadv_priv *bat_priv, struct net_device *bridge) { u8 flags = BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; flags &= batadv_mcast_mla_rtr_flags_meshif_get(bat_priv, bridge); flags &= batadv_mcast_mla_rtr_flags_bridge_get(bat_priv, bridge); return flags; } /** * batadv_mcast_mla_forw_flags_get() - get multicast forwarding flags * @bat_priv: the bat priv with all the mesh interface information * * Checks if all active hard interfaces have an MTU larger or equal to 1280 * bytes (IPv6 minimum MTU). 
* * Return: BATADV_MCAST_HAVE_MC_PTYPE_CAPA if yes, BATADV_NO_FLAGS otherwise. */ static u8 batadv_mcast_mla_forw_flags_get(struct batadv_priv *bat_priv) { const struct batadv_hard_iface *hard_iface; struct list_head *iter; rcu_read_lock(); netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (hard_iface->net_dev->mtu < IPV6_MIN_MTU) { rcu_read_unlock(); return BATADV_NO_FLAGS; } } rcu_read_unlock(); return BATADV_MCAST_HAVE_MC_PTYPE_CAPA; } /** * batadv_mcast_mla_flags_get() - get the new multicast flags * @bat_priv: the bat priv with all the mesh interface information * * Return: A set of flags for the current/next TVLV, querier and * bridge state. */ static struct batadv_mcast_mla_flags batadv_mcast_mla_flags_get(struct batadv_priv *bat_priv) { struct net_device *dev = bat_priv->mesh_iface; struct batadv_mcast_querier_state *qr4, *qr6; struct batadv_mcast_mla_flags mla_flags; struct net_device *bridge; bridge = batadv_mcast_get_bridge(dev); memset(&mla_flags, 0, sizeof(mla_flags)); mla_flags.enabled = 1; mla_flags.tvlv_flags |= batadv_mcast_mla_rtr_flags_get(bat_priv, bridge); mla_flags.tvlv_flags |= batadv_mcast_mla_forw_flags_get(bat_priv); if (!bridge) return mla_flags; dev_put(bridge); mla_flags.bridged = 1; qr4 = &mla_flags.querier_ipv4; qr6 = &mla_flags.querier_ipv6; if (!IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING)) pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n"); qr4->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP); qr4->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP); qr6->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6); qr6->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6); mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES; /* 1) If no querier exists at all, then multicast listeners on * our local TT clients behind the bridge will keep silent. * 2) If the selected querier is on one of our local TT clients, * behind the bridge, then this querier might shadow multicast * listeners on our local TT clients, behind this bridge. * * In both cases, we will signalize other batman nodes that * we need all multicast traffic of the according protocol. */ if (!qr4->exists || qr4->shadowing) { mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV4; mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR4; } if (!qr6->exists || qr6->shadowing) { mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV6; mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR6; } return mla_flags; } /** * batadv_mcast_mla_is_duplicate() - check whether an address is in a list * @mcast_addr: the multicast address to check * @mcast_list: the list with multicast addresses to search in * * Return: true if the given address is already in the given list. * Otherwise returns false. */ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; hlist_for_each_entry(mcast_entry, mcast_list, list) if (batadv_compare_eth(mcast_entry->addr, mcast_addr)) return true; return false; } /** * batadv_mcast_mla_meshif_get_ipv4() - get meshif IPv4 multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of IPv4 multicast listeners residing * on this kernel on the given mesh interface, dev, in * the given mcast_list. 
In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_meshif_get_ipv4(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct batadv_hw_addr *new; struct in_device *in_dev; u8 mcast_addr[ETH_ALEN]; struct ip_mc_list *pmc; int ret = 0; if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4) return 0; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return 0; } for (pmc = rcu_dereference(in_dev->mc_list); pmc; pmc = rcu_dereference(pmc->next_rcu)) { if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv4_is_local_multicast(pmc->multiaddr)) continue; if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) && !ipv4_is_local_multicast(pmc->multiaddr)) continue; ip_eth_mc_map(pmc->multiaddr, mcast_addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; } ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); ret++; } rcu_read_unlock(); return ret; } /** * batadv_mcast_mla_meshif_get_ipv6() - get meshif IPv6 multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of IPv6 multicast listeners residing * on this kernel on the given mesh interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. 
*/ #if IS_ENABLED(CONFIG_IPV6) static int batadv_mcast_mla_meshif_get_ipv6(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct batadv_hw_addr *new; struct inet6_dev *in6_dev; u8 mcast_addr[ETH_ALEN]; struct ifmcaddr6 *pmc6; int ret = 0; if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6) return 0; rcu_read_lock(); in6_dev = __in6_dev_get(dev); if (!in6_dev) { rcu_read_unlock(); return 0; } for (pmc6 = rcu_dereference(in6_dev->mc_list); pmc6; pmc6 = rcu_dereference(pmc6->next)) { if (IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) continue; if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv6_addr_is_ll_all_nodes(&pmc6->mca_addr)) continue; if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) && IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) > IPV6_ADDR_SCOPE_LINKLOCAL) continue; ipv6_eth_mc_map(&pmc6->mca_addr, mcast_addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; } ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); ret++; } rcu_read_unlock(); return ret; } #else static inline int batadv_mcast_mla_meshif_get_ipv6(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { return 0; } #endif /** * batadv_mcast_mla_meshif_get() - get meshif multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of multicast listeners residing * on this kernel on the given mesh interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * * If there is a bridge interface on top of dev, collect from that one * instead. Just like with IP addresses and routes, multicast listeners * will(/should) register to the bridge interface instead of an * enslaved bat0. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_meshif_get(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct net_device *bridge = batadv_mcast_get_bridge(dev); int ret4, ret6 = 0; if (bridge) dev = bridge; ret4 = batadv_mcast_mla_meshif_get_ipv4(dev, mcast_list, flags); if (ret4 < 0) goto out; ret6 = batadv_mcast_mla_meshif_get_ipv6(dev, mcast_list, flags); if (ret6 < 0) { ret4 = 0; goto out; } out: dev_put(bridge); return ret4 + ret6; } /** * batadv_mcast_mla_br_addr_cpy() - copy a bridge multicast address * @dst: destination to write to - a multicast MAC address * @src: source to read from - a multicast IP address * * Converts a given multicast IPv4/IPv6 address from a bridge * to its matching multicast MAC address and copies it into the given * destination buffer. * * Caller needs to make sure the destination buffer can hold * at least ETH_ALEN bytes. 
*/ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) { if (src->proto == htons(ETH_P_IP)) ip_eth_mc_map(src->dst.ip4, dst); #if IS_ENABLED(CONFIG_IPV6) else if (src->proto == htons(ETH_P_IPV6)) ipv6_eth_mc_map(&src->dst.ip6, dst); #endif else eth_zero_addr(dst); } /** * batadv_mcast_mla_bridge_get() - get bridged-in multicast listeners * @dev: a bridge slave whose bridge to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of multicast listeners residing * on foreign, non-mesh devices which we gave access to our mesh via * a bridge on top of the given mesh interface, dev, in the given * mcast_list. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_bridge_get(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); struct br_ip_list *br_ip_entry, *tmp; u8 tvlv_flags = flags->tvlv_flags; struct batadv_hw_addr *new; u8 mcast_addr[ETH_ALEN]; int ret; /* we don't need to detect these devices/listeners, the IGMP/MLD * snooping code of the Linux bridge already does that for us */ ret = br_multicast_list_adjacent(dev, &bridge_mcast_list); if (ret < 0) goto out; list_for_each_entry(br_ip_entry, &bridge_mcast_list, list) { if (br_ip_entry->addr.proto == htons(ETH_P_IP)) { if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4) continue; if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4)) continue; if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) && !ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4)) continue; } #if IS_ENABLED(CONFIG_IPV6) if (br_ip_entry->addr.proto == htons(ETH_P_IPV6)) { if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6) continue; if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv6_addr_is_ll_all_nodes(&br_ip_entry->addr.dst.ip6)) continue; if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) && IPV6_ADDR_MC_SCOPE(&br_ip_entry->addr.dst.ip6) > IPV6_ADDR_SCOPE_LINKLOCAL) continue; } #endif batadv_mcast_mla_br_addr_cpy(mcast_addr, &br_ip_entry->addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; } ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); } out: list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) { list_del(&br_ip_entry->list); kfree(br_ip_entry); } return ret; } /** * batadv_mcast_mla_list_free() - free a list of multicast addresses * @mcast_list: the list to free * * Removes and frees all items in the given mcast_list. */ static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) { hlist_del(&mcast_entry->list); kfree(mcast_entry); } } /** * batadv_mcast_mla_tt_retract() - clean up multicast listener announcements * @bat_priv: the bat priv with all the mesh interface information * @mcast_list: a list of addresses which should _not_ be removed * * Retracts the announcement of any multicast listener from the * translation table except the ones listed in the given mcast_list. * * If mcast_list is NULL then all are retracted. 
*/ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list, list) { if (mcast_list && batadv_mcast_mla_is_duplicate(mcast_entry->addr, mcast_list)) continue; batadv_tt_local_remove(bat_priv, mcast_entry->addr, BATADV_NO_FLAGS, "mcast TT outdated", false); hlist_del(&mcast_entry->list); kfree(mcast_entry); } } /** * batadv_mcast_mla_tt_add() - add multicast listener announcements * @bat_priv: the bat priv with all the mesh interface information * @mcast_list: a list of addresses which are going to get added * * Adds multicast listener announcements from the given mcast_list to the * translation table if they have not been added yet. */ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; if (!mcast_list) return; hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) { if (batadv_mcast_mla_is_duplicate(mcast_entry->addr, &bat_priv->mcast.mla_list)) continue; if (!batadv_tt_local_add(bat_priv->mesh_iface, mcast_entry->addr, BATADV_NO_FLAGS, BATADV_NULL_IFINDEX, BATADV_NO_MARK)) continue; hlist_del(&mcast_entry->list); hlist_add_head(&mcast_entry->list, &bat_priv->mcast.mla_list); } } /** * batadv_mcast_querier_log() - debug output regarding the querier status on * link * @bat_priv: the bat priv with all the mesh interface information * @str_proto: a string for the querier protocol (e.g. "IGMP" or "MLD") * @old_state: the previous querier state on our link * @new_state: the new querier state on our link * * Outputs debug messages to the logging facility with log level 'mcast' * regarding changes to the querier status on the link which are relevant * to our multicast optimizations. * * Usually this is about whether a querier appeared or vanished in * our mesh or whether the querier is in the suboptimal position of being * behind our local bridge segment: Snooping switches will directly * forward listener reports to the querier, therefore batman-adv and * the bridge will potentially not see these listeners - the querier is * potentially shadowing listeners from us then. * * This is only interesting for nodes with a bridge on top of their * mesh interface. 
*/ static void batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, struct batadv_mcast_querier_state *old_state, struct batadv_mcast_querier_state *new_state) { if (!old_state->exists && new_state->exists) batadv_info(bat_priv->mesh_iface, "%s Querier appeared\n", str_proto); else if (old_state->exists && !new_state->exists) batadv_info(bat_priv->mesh_iface, "%s Querier disappeared - multicast optimizations disabled\n", str_proto); else if (!bat_priv->mcast.mla_flags.bridged && !new_state->exists) batadv_info(bat_priv->mesh_iface, "No %s Querier present - multicast optimizations disabled\n", str_proto); if (new_state->exists) { if ((!old_state->shadowing && new_state->shadowing) || (!old_state->exists && new_state->shadowing)) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "%s Querier is behind our bridged segment: Might shadow listeners\n", str_proto); else if (old_state->shadowing && !new_state->shadowing) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "%s Querier is not behind our bridged segment\n", str_proto); } } /** * batadv_mcast_bridge_log() - debug output for topology changes in bridged * setups * @bat_priv: the bat priv with all the mesh interface information * @new_flags: flags indicating the new multicast state * * If no bridges are ever used on this node, then this function does nothing. * * Otherwise this function outputs debug information to the 'mcast' log level * which might be relevant to our multicast optimizations. * * More precisely, it outputs information when a bridge interface is added or * removed from a mesh interface. And when a bridge is present, it further * outputs information about the querier state which is relevant for the * multicast flags this node is going to set. */ static void batadv_mcast_bridge_log(struct batadv_priv *bat_priv, struct batadv_mcast_mla_flags *new_flags) { struct batadv_mcast_mla_flags *old_flags = &bat_priv->mcast.mla_flags; if (!old_flags->bridged && new_flags->bridged) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Bridge added: Setting Unsnoopables(U)-flag\n"); else if (old_flags->bridged && !new_flags->bridged) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Bridge removed: Unsetting Unsnoopables(U)-flag\n"); if (new_flags->bridged) { batadv_mcast_querier_log(bat_priv, "IGMP", &old_flags->querier_ipv4, &new_flags->querier_ipv4); batadv_mcast_querier_log(bat_priv, "MLD", &old_flags->querier_ipv6, &new_flags->querier_ipv6); } } /** * batadv_mcast_flags_log() - output debug information about mcast flag changes * @bat_priv: the bat priv with all the mesh interface information * @flags: TVLV flags indicating the new multicast state * * Whenever the multicast TVLV flags this node announces change, this function * should be used to notify userspace about the change. */ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) { bool old_enabled = bat_priv->mcast.mla_flags.enabled; u8 old_flags = bat_priv->mcast.mla_flags.tvlv_flags; char str_old_flags[] = "[.... . .]"; sprintf(str_old_flags, "[%c%c%c%s%s%c]", (old_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (old_flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', !(old_flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", !(old_flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ", !(old_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) ? 'P' : '.'); batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Changing multicast flags from '%s' to '[%c%c%c%s%s%c]'\n", old_enabled ? 
str_old_flags : "<undefined>", (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ", !(flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) ? 'P' : '.'); } /** * batadv_mcast_mla_flags_update() - update multicast flags * @bat_priv: the bat priv with all the mesh interface information * @flags: flags indicating the new multicast state * * Updates the own multicast tvlv with our current multicast related settings, * capabilities and inabilities. */ static void batadv_mcast_mla_flags_update(struct batadv_priv *bat_priv, struct batadv_mcast_mla_flags *flags) { struct batadv_tvlv_mcast_data mcast_data; if (!memcmp(flags, &bat_priv->mcast.mla_flags, sizeof(*flags))) return; batadv_mcast_bridge_log(bat_priv, flags); batadv_mcast_flags_log(bat_priv, flags->tvlv_flags); mcast_data.flags = flags->tvlv_flags; memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved)); batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2, &mcast_data, sizeof(mcast_data)); bat_priv->mcast.mla_flags = *flags; } /** * __batadv_mcast_mla_update() - update the own MLAs * @bat_priv: the bat priv with all the mesh interface information * * Updates the own multicast listener announcements in the translation * table as well as the own, announced multicast tvlv container. * * Note that non-conflicting reads and writes to bat_priv->mcast.mla_list * in batadv_mcast_mla_tt_retract() and batadv_mcast_mla_tt_add() are * ensured by the non-parallel execution of the worker this function * belongs to. */ static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv) { struct net_device *mesh_iface = bat_priv->mesh_iface; struct hlist_head mcast_list = HLIST_HEAD_INIT; struct batadv_mcast_mla_flags flags; int ret; flags = batadv_mcast_mla_flags_get(bat_priv); ret = batadv_mcast_mla_meshif_get(mesh_iface, &mcast_list, &flags); if (ret < 0) goto out; ret = batadv_mcast_mla_bridge_get(mesh_iface, &mcast_list, &flags); if (ret < 0) goto out; spin_lock(&bat_priv->mcast.mla_lock); batadv_mcast_mla_tt_retract(bat_priv, &mcast_list); batadv_mcast_mla_tt_add(bat_priv, &mcast_list); batadv_mcast_mla_flags_update(bat_priv, &flags); spin_unlock(&bat_priv->mcast.mla_lock); out: batadv_mcast_mla_list_free(&mcast_list); } /** * batadv_mcast_mla_update() - update the own MLAs * @work: kernel work struct * * Updates the own multicast listener announcements in the translation * table as well as the own, announced multicast tvlv container. * * In the end, reschedules the work timer. */ static void batadv_mcast_mla_update(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_priv_mcast *priv_mcast; struct batadv_priv *bat_priv; delayed_work = to_delayed_work(work); priv_mcast = container_of(delayed_work, struct batadv_priv_mcast, work); bat_priv = container_of(priv_mcast, struct batadv_priv, mcast); __batadv_mcast_mla_update(bat_priv); batadv_mcast_start_timer(bat_priv); } /** * batadv_mcast_is_report_ipv4() - check for IGMP reports * @skb: the ethernet frame destined for the mesh * * This call might reallocate skb data. * * Checks whether the given frame is a valid IGMP report. * * Return: If so then true, otherwise false. 
*/ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb) { if (ip_mc_check_igmp(skb) < 0) return false; switch (igmp_hdr(skb)->type) { case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: case IGMPV3_HOST_MEMBERSHIP_REPORT: return true; } return false; } /** * batadv_mcast_forw_mode_check_ipv4() - check for optimized forwarding * potential * @bat_priv: the bat priv with all the mesh interface information * @skb: the IPv4 packet to check * @is_unsnoopable: stores whether the destination is snoopable * @is_routable: stores whether the destination is routable * * Checks whether the given IPv4 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * * Return: If so then 0. Otherwise -EINVAL or -ENOMEM in case of memory * allocation failure. */ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable, int *is_routable) { struct iphdr *iphdr; /* We might fail due to out-of-memory -> drop it */ if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*iphdr))) return -ENOMEM; if (batadv_mcast_is_report_ipv4(skb)) return -EINVAL; iphdr = ip_hdr(skb); /* link-local multicast listeners behind a bridge are * not snoopable (see RFC4541, section 2.1.2.2) */ if (ipv4_is_local_multicast(iphdr->daddr)) *is_unsnoopable = true; else *is_routable = ETH_P_IP; return 0; } /** * batadv_mcast_is_report_ipv6() - check for MLD reports * @skb: the ethernet frame destined for the mesh * * This call might reallocate skb data. * * Checks whether the given frame is a valid MLD report. * * Return: If so then true, otherwise false. */ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb) { if (ipv6_mc_check_mld(skb) < 0) return false; switch (icmp6_hdr(skb)->icmp6_type) { case ICMPV6_MGM_REPORT: case ICMPV6_MLD2_REPORT: return true; } return false; } /** * batadv_mcast_forw_mode_check_ipv6() - check for optimized forwarding * potential * @bat_priv: the bat priv with all the mesh interface information * @skb: the IPv6 packet to check * @is_unsnoopable: stores whether the destination is snoopable * @is_routable: stores whether the destination is routable * * Checks whether the given IPv6 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * * Return: If so then 0. 
Otherwise -EINVAL is or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable, int *is_routable) { struct ipv6hdr *ip6hdr; /* We might fail due to out-of-memory -> drop it */ if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*ip6hdr))) return -ENOMEM; if (batadv_mcast_is_report_ipv6(skb)) return -EINVAL; ip6hdr = ipv6_hdr(skb); if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) < IPV6_ADDR_SCOPE_LINKLOCAL) return -EINVAL; /* link-local-all-nodes multicast listeners behind a bridge are * not snoopable (see RFC4541, section 3, paragraph 3) */ if (ipv6_addr_is_ll_all_nodes(&ip6hdr->daddr)) *is_unsnoopable = true; else if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) > IPV6_ADDR_SCOPE_LINKLOCAL) *is_routable = ETH_P_IPV6; return 0; } /** * batadv_mcast_forw_mode_check() - check for optimized forwarding potential * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast frame to check * @is_unsnoopable: stores whether the destination is snoopable * @is_routable: stores whether the destination is routable * * Checks whether the given multicast ethernet frame has the potential to be * forwarded with a mode more optimal than classic flooding. * * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable, int *is_routable) { struct ethhdr *ethhdr = eth_hdr(skb); if (!atomic_read(&bat_priv->multicast_mode)) return -EINVAL; switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb, is_unsnoopable, is_routable); case ETH_P_IPV6: if (!IS_ENABLED(CONFIG_IPV6)) return -EINVAL; return batadv_mcast_forw_mode_check_ipv6(bat_priv, skb, is_unsnoopable, is_routable); default: return -EINVAL; } } /** * batadv_mcast_forw_want_all_ip_count() - count nodes with unspecific mcast * interest * @bat_priv: the bat priv with all the mesh interface information * @ethhdr: ethernet header of a packet * * Return: the number of nodes which want all IPv4 multicast traffic if the * given ethhdr is from an IPv4 packet or the number of nodes which want all * IPv6 traffic if it matches an IPv6 packet. */ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv, struct ethhdr *ethhdr) { switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return atomic_read(&bat_priv->mcast.num_want_all_ipv4); case ETH_P_IPV6: return atomic_read(&bat_priv->mcast.num_want_all_ipv6); default: /* we shouldn't be here... */ return 0; } } /** * batadv_mcast_forw_rtr_count() - count nodes with a multicast router * @bat_priv: the bat priv with all the mesh interface information * @protocol: the ethernet protocol type to count multicast routers for * * Return: the number of nodes which want all routable IPv4 multicast traffic * if the protocol is ETH_P_IP or the number of nodes which want all routable * IPv6 traffic if the protocol is ETH_P_IPV6. Otherwise returns 0. 
*/ static int batadv_mcast_forw_rtr_count(struct batadv_priv *bat_priv, int protocol) { switch (protocol) { case ETH_P_IP: return atomic_read(&bat_priv->mcast.num_want_all_rtr4); case ETH_P_IPV6: return atomic_read(&bat_priv->mcast.num_want_all_rtr6); default: return 0; } } /** * batadv_mcast_forw_mode_by_count() - get forwarding mode by count * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to check * @vid: the vlan identifier * @is_routable: stores whether the destination is routable * @count: the number of originators the multicast packet need to be sent to * * For a multicast packet with multiple destination originators, checks which * mode to use. For BATADV_FORW_MCAST it also encapsulates the packet with a * complete batman-adv multicast header. * * Return: * BATADV_FORW_MCAST: If all nodes have multicast packet routing * capabilities and an MTU >= 1280 on all hard interfaces (including us) * and the encapsulated multicast packet with all destination addresses * would still fit into an 1280 bytes batman-adv multicast packet * (excluding the outer ethernet frame) and we could successfully push * the full batman-adv multicast packet header. * BATADV_FORW_UCASTS: If the packet cannot be sent in a batman-adv * multicast packet and the amount of batman-adv unicast packets needed * is smaller or equal to the configured multicast fanout. * BATADV_FORW_BCAST: Otherwise. */ static enum batadv_forw_mode batadv_mcast_forw_mode_by_count(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int is_routable, int count) { unsigned int mcast_hdrlen = batadv_mcast_forw_packet_hdrlen(count); u8 own_tvlv_flags = bat_priv->mcast.mla_flags.tvlv_flags; if (!atomic_read(&bat_priv->mcast.num_no_mc_ptype_capa) && own_tvlv_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA && skb->len + mcast_hdrlen <= IPV6_MIN_MTU && batadv_mcast_forw_push(bat_priv, skb, vid, is_routable, count)) return BATADV_FORW_MCAST; if (count <= atomic_read(&bat_priv->multicast_fanout)) return BATADV_FORW_UCASTS; return BATADV_FORW_BCAST; } /** * batadv_mcast_forw_mode() - check on how to forward a multicast packet * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to check * @vid: the vlan identifier * @is_routable: stores whether the destination is routable * * Return: The forwarding mode as enum batadv_forw_mode. */ enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int *is_routable) { int ret, tt_count, ip_count, unsnoop_count, total_count; bool is_unsnoopable = false; struct ethhdr *ethhdr; int rtr_count = 0; ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable, is_routable); if (ret == -ENOMEM) return BATADV_FORW_NONE; else if (ret < 0) return BATADV_FORW_BCAST; ethhdr = eth_hdr(skb); tt_count = batadv_tt_global_hash_count(bat_priv, ethhdr->h_dest, BATADV_NO_FLAGS); ip_count = batadv_mcast_forw_want_all_ip_count(bat_priv, ethhdr); unsnoop_count = !is_unsnoopable ? 
0 : atomic_read(&bat_priv->mcast.num_want_all_unsnoopables); rtr_count = batadv_mcast_forw_rtr_count(bat_priv, *is_routable); total_count = tt_count + ip_count + unsnoop_count + rtr_count; if (!total_count) return BATADV_FORW_NONE; else if (unsnoop_count) return BATADV_FORW_BCAST; return batadv_mcast_forw_mode_by_count(bat_priv, skb, vid, *is_routable, total_count); } /** * batadv_mcast_forw_send_orig() - send a multicast packet to an originator * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to send * @vid: the vlan identifier * @orig_node: the originator to send the packet to * * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ static int batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, struct batadv_orig_node *orig_node) { /* Avoid sending multicast-in-unicast packets to other BLA * gateways - they already got the frame from the LAN side * we share with them. * TODO: Refactor to take BLA into account earlier, to avoid * reducing the mcast_fanout count. */ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid)) { dev_kfree_skb(skb); return NET_XMIT_SUCCESS; } return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST, 0, orig_node, vid); } /** * batadv_mcast_forw_tt() - forwards a packet to multicast listeners * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any multicast * listener registered in the translation table. A transmission is performed * via a batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_tt(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; struct batadv_tt_orig_list_entry *orig_entry; struct batadv_tt_global_entry *tt_global; const u8 *addr = eth_hdr(skb)->h_dest; tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt_global) goto out; rcu_read_lock(); hlist_for_each_entry_rcu(orig_entry, &tt_global->orig_list, list) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_entry->orig_node); } rcu_read_unlock(); batadv_tt_global_entry_put(tt_global); out: return ret; } /** * batadv_mcast_forw_want_all_ipv4() - forward to nodes with want-all-ipv4 * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_ALL_IPV4 flag set. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. 
*/ static int batadv_mcast_forw_want_all_ipv4(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_ipv4_list, mcast_want_all_ipv4_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_all_ipv6() - forward to nodes with want-all-ipv6 * @bat_priv: the bat priv with all the mesh interface information * @skb: The multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_ALL_IPV6 flag set. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_ipv6(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_ipv6_list, mcast_want_all_ipv6_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_all() - forward packet to nodes in a want-all list * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_ALL_IPV4 or BATADV_MCAST_WANT_ALL_IPV6 flag set. A * transmission is performed via a batman-adv unicast packet for each such * destination node. * * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. */ static int batadv_mcast_forw_want_all(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { switch (ntohs(eth_hdr(skb)->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_want_all_ipv4(bat_priv, skb, vid); case ETH_P_IPV6: return batadv_mcast_forw_want_all_ipv6(bat_priv, skb, vid); default: /* we shouldn't be here... */ return NET_XMIT_DROP; } } /** * batadv_mcast_forw_want_all_rtr4() - forward to nodes with want-all-rtr4 * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_NO_RTR4 flag unset. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. 
*/ static int batadv_mcast_forw_want_all_rtr4(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_rtr4_list, mcast_want_all_rtr4_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_all_rtr6() - forward to nodes with want-all-rtr6 * @bat_priv: the bat priv with all the mesh interface information * @skb: The multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_NO_RTR6 flag unset. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_rtr6(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_rtr6_list, mcast_want_all_rtr6_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_rtr() - forward packet to nodes in a want-all-rtr list * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_NO_RTR4 or BATADV_MCAST_WANT_NO_RTR6 flag unset. A * transmission is performed via a batman-adv unicast packet for each such * destination node. * * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. */ static int batadv_mcast_forw_want_rtr(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { switch (ntohs(eth_hdr(skb)->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_want_all_rtr4(bat_priv, skb, vid); case ETH_P_IPV6: return batadv_mcast_forw_want_all_rtr6(bat_priv, skb, vid); default: /* we shouldn't be here... */ return NET_XMIT_DROP; } } /** * batadv_mcast_forw_send() - send packet to any detected multicast recipient * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * @is_routable: stores whether the destination is routable * * Sends copies of a frame with multicast destination to any node that signaled * interest in it, that is either via the translation table or the according * want-all flags. A transmission is performed via a batman-adv unicast packet * for each such destination node. * * The given skb is consumed/freed. * * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. 
*/ int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int is_routable) { int ret; ret = batadv_mcast_forw_tt(bat_priv, skb, vid); if (ret != NET_XMIT_SUCCESS) { kfree_skb(skb); return ret; } ret = batadv_mcast_forw_want_all(bat_priv, skb, vid); if (ret != NET_XMIT_SUCCESS) { kfree_skb(skb); return ret; } if (!is_routable) goto skip_mc_router; ret = batadv_mcast_forw_want_rtr(bat_priv, skb, vid); if (ret != NET_XMIT_SUCCESS) { kfree_skb(skb); return ret; } skip_mc_router: consume_skb(skb); return ret; } /** * batadv_mcast_want_unsnoop_update() - update unsnoop counter and list * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator, * orig, has toggled then this method updates the counter and the list * accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_unsnoopables_node; struct hlist_head *head = &bat_priv->mcast.want_all_unsnoopables_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)) { atomic_inc(&bat_priv->mcast.num_want_all_unsnoopables); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) { atomic_dec(&bat_priv->mcast.num_want_all_unsnoopables); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_ipv4_update() - update want-all-ipv4 counter and list * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. 
*/ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_ipv4_node; struct hlist_head *head = &bat_priv->mcast.want_all_ipv4_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4)) { atomic_inc(&bat_priv->mcast.num_want_all_ipv4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) { atomic_dec(&bat_priv->mcast.num_want_all_ipv4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_ipv6_update() - update want-all-ipv6 counter and list * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_ipv6_node; struct hlist_head *head = &bat_priv->mcast.want_all_ipv6_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6)) { atomic_inc(&bat_priv->mcast.num_want_all_ipv6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) { atomic_dec(&bat_priv->mcast.num_want_all_ipv6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_rtr4_update() - update want-all-rtr4 counter and list * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR4 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. 
*/ static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_rtr4_node; struct hlist_head *head = &bat_priv->mcast.want_all_rtr4_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag set to unset */ if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR4) && orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4) { atomic_inc(&bat_priv->mcast.num_want_all_rtr4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag unset to set */ } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR4 && !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4)) { atomic_dec(&bat_priv->mcast.num_want_all_rtr4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_rtr6_update() - update want-all-rtr6 counter and list * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR6 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_rtr6_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_rtr6_node; struct hlist_head *head = &bat_priv->mcast.want_all_rtr6_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag set to unset */ if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR6) && orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6) { atomic_inc(&bat_priv->mcast.num_want_all_rtr6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag unset to set */ } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR6 && !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6)) { atomic_dec(&bat_priv->mcast.num_want_all_rtr6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_have_mc_ptype_update() - update multicast packet type counter * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_HAVE_MC_PTYPE_CAPA flag of this originator, orig, has * toggled then this method updates the counter accordingly. 
*/ static void batadv_mcast_have_mc_ptype_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag set to unset */ if (!(mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) && orig->mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) atomic_inc(&bat_priv->mcast.num_no_mc_ptype_capa); /* switched from flag unset to set */ else if (mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA && !(orig->mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA)) atomic_dec(&bat_priv->mcast.num_no_mc_ptype_capa); } /** * batadv_mcast_tvlv_flags_get() - get multicast flags from an OGM TVLV * @enabled: whether the originator has multicast TVLV support enabled * @tvlv_value: tvlv buffer containing the multicast flags * @tvlv_value_len: tvlv buffer length * * Return: multicast flags for the given tvlv buffer */ static u8 batadv_mcast_tvlv_flags_get(bool enabled, void *tvlv_value, u16 tvlv_value_len) { u8 mcast_flags = BATADV_NO_FLAGS; if (enabled && tvlv_value && tvlv_value_len >= sizeof(mcast_flags)) mcast_flags = *(u8 *)tvlv_value; if (!enabled) { mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4; mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6; } /* remove redundant flags to avoid sending duplicate packets later */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) mcast_flags |= BATADV_MCAST_WANT_NO_RTR4; if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) mcast_flags |= BATADV_MCAST_WANT_NO_RTR6; return mcast_flags; } /** * batadv_mcast_tvlv_ogm_handler() - process incoming multicast tvlv container * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the multicast data * @tvlv_value_len: tvlv buffer length */ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 flags, void *tvlv_value, u16 tvlv_value_len) { bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND); u8 mcast_flags; mcast_flags = batadv_mcast_tvlv_flags_get(orig_mcast_enabled, tvlv_value, tvlv_value_len); spin_lock_bh(&orig->mcast_handler_lock); if (orig_mcast_enabled && !test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) { set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); } else if (!orig_mcast_enabled && test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) { clear_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); } set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized); batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags); batadv_mcast_want_rtr4_update(bat_priv, orig, mcast_flags); batadv_mcast_want_rtr6_update(bat_priv, orig, mcast_flags); batadv_mcast_have_mc_ptype_update(bat_priv, orig, mcast_flags); orig->mcast_flags = mcast_flags; spin_unlock_bh(&orig->mcast_handler_lock); } /** * batadv_mcast_init() - initialize the multicast optimizations structures * @bat_priv: the bat priv with all the mesh interface information */ void batadv_mcast_init(struct batadv_priv *bat_priv) { batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler, NULL, NULL, BATADV_TVLV_MCAST, 2, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); batadv_tvlv_handler_register(bat_priv, NULL, NULL, batadv_mcast_forw_tracker_tvlv_handler, BATADV_TVLV_MCAST_TRACKER, 1, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); INIT_DELAYED_WORK(&bat_priv->mcast.work, 
batadv_mcast_mla_update); batadv_mcast_start_timer(bat_priv); } /** * batadv_mcast_mesh_info_put() - put multicast info into a netlink message * @msg: buffer for the message * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 or error code. */ int batadv_mcast_mesh_info_put(struct sk_buff *msg, struct batadv_priv *bat_priv) { u32 flags = bat_priv->mcast.mla_flags.tvlv_flags; u32 flags_priv = BATADV_NO_FLAGS; if (bat_priv->mcast.mla_flags.bridged) { flags_priv |= BATADV_MCAST_FLAGS_BRIDGED; if (bat_priv->mcast.mla_flags.querier_ipv4.exists) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS; if (bat_priv->mcast.mla_flags.querier_ipv6.exists) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS; if (bat_priv->mcast.mla_flags.querier_ipv4.shadowing) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING; if (bat_priv->mcast.mla_flags.querier_ipv6.shadowing) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING; } if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, flags) || nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS_PRIV, flags_priv)) return -EMSGSIZE; return 0; } /** * batadv_mcast_flags_dump_entry() - dump one entry of the multicast flags table * to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @orig_node: originator to dump the multicast flags of * * Return: 0 or error code. */ static int batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_orig_node *orig_node) { void *hdr; hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_MCAST_FLAGS); if (!hdr) return -ENOBUFS; genl_dump_check_consistent(cb, hdr); if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig)) { genlmsg_cancel(msg, hdr); return -EMSGSIZE; } if (test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig_node->capabilities)) { if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, orig_node->mcast_flags)) { genlmsg_cancel(msg, hdr); return -EMSGSIZE; } } genlmsg_end(msg, hdr); return 0; } /** * batadv_mcast_flags_dump_bucket() - dump one bucket of the multicast flags * table to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @hash: hash to dump * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: 0 or error code. */ static int batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hashtable *hash, unsigned int bucket, long *idx_skip) { struct batadv_orig_node *orig_node; long idx = 0; spin_lock_bh(&hash->list_locks[bucket]); cb->seq = atomic_read(&hash->generation) << 1 | 1; hlist_for_each_entry(orig_node, &hash->table[bucket], hash_entry) { if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig_node->capa_initialized)) continue; if (idx < *idx_skip) goto skip; if (batadv_mcast_flags_dump_entry(msg, portid, cb, orig_node)) { spin_unlock_bh(&hash->list_locks[bucket]); *idx_skip = idx; return -EMSGSIZE; } skip: idx++; } spin_unlock_bh(&hash->list_locks[bucket]); return 0; } /** * __batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @bat_priv: the bat priv with all the mesh interface information * @bucket: current bucket to dump * @idx: index in current bucket to the next entry to dump * * Return: 0 or error code. 
*/ static int __batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, long *bucket, long *idx) { struct batadv_hashtable *hash = bat_priv->orig_hash; long bucket_tmp = *bucket; long idx_tmp = *idx; while (bucket_tmp < hash->size) { if (batadv_mcast_flags_dump_bucket(msg, portid, cb, hash, bucket_tmp, &idx_tmp)) break; bucket_tmp++; idx_tmp = 0; } *bucket = bucket_tmp; *idx = idx_tmp; return msg->len; } /** * batadv_mcast_netlink_get_primary() - get primary interface from netlink * callback * @cb: netlink callback structure * @primary_if: the primary interface pointer to return the result in * * Return: 0 or error code. */ static int batadv_mcast_netlink_get_primary(struct netlink_callback *cb, struct batadv_hard_iface **primary_if) { struct batadv_hard_iface *hard_iface = NULL; struct net_device *mesh_iface; struct batadv_priv *bat_priv; int ret = 0; mesh_iface = batadv_netlink_get_meshif(cb); if (IS_ERR(mesh_iface)) return PTR_ERR(mesh_iface); bat_priv = netdev_priv(mesh_iface); hard_iface = batadv_primary_if_get_selected(bat_priv); if (!hard_iface || hard_iface->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } out: dev_put(mesh_iface); if (!ret && primary_if) *primary_if = hard_iface; else batadv_hardif_put(hard_iface); return ret; } /** * batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket * @msg: buffer for the message * @cb: callback structure containing arguments * * Return: message length. */ int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; int portid = NETLINK_CB(cb->skb).portid; struct batadv_priv *bat_priv; long *bucket = &cb->args[0]; long *idx = &cb->args[1]; int ret; ret = batadv_mcast_netlink_get_primary(cb, &primary_if); if (ret) return ret; bat_priv = netdev_priv(primary_if->mesh_iface); ret = __batadv_mcast_flags_dump(msg, portid, cb, bat_priv, bucket, idx); batadv_hardif_put(primary_if); return ret; } /** * batadv_mcast_free() - free the multicast optimizations structures * @bat_priv: the bat priv with all the mesh interface information */ void batadv_mcast_free(struct batadv_priv *bat_priv) { cancel_delayed_work_sync(&bat_priv->mcast.work); batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 2); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST_TRACKER, 1); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 2); /* safely calling outside of worker, as worker was canceled above */ batadv_mcast_mla_tt_retract(bat_priv, NULL); } /** * batadv_mcast_purge_orig() - reset originator global mcast state modifications * @orig: the originator which is going to get purged */ void batadv_mcast_purge_orig(struct batadv_orig_node *orig) { struct batadv_priv *bat_priv = orig->bat_priv; spin_lock_bh(&orig->mcast_handler_lock); batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_rtr4_update(bat_priv, orig, BATADV_MCAST_WANT_NO_RTR4); batadv_mcast_want_rtr6_update(bat_priv, orig, BATADV_MCAST_WANT_NO_RTR6); batadv_mcast_have_mc_ptype_update(bat_priv, orig, BATADV_MCAST_HAVE_MC_PTYPE_CAPA); spin_unlock_bh(&orig->mcast_handler_lock); }
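The batadv_mcast_want_*_update() helpers above all follow the same transition pattern: a per-mesh counter and want-list are only touched when the tracked flag actually toggles between the originator's previously stored flags and the newly received ones, so repeated OGMs with unchanged flags never skew the counters. Below is a minimal standalone sketch of that pattern, using an illustrative flag value and a plain int instead of the kernel's atomics and RCU lists; note that in the kernel the caller (batadv_mcast_tvlv_ogm_handler) stores orig->mcast_flags after running all helpers, while here the old flags are updated in place for brevity.

#include <stdio.h>

#define WANT_ALL_IPV4 0x02	/* illustrative bit, not the kernel constant */

static int num_want_all_ipv4;	/* stands in for bat_priv->mcast.num_want_all_ipv4 */

/* Only adjust the counter when the tracked flag really toggles. */
static void want_ipv4_update(unsigned char *orig_flags, unsigned char new_flags)
{
	/* switched from flag unset to set */
	if ((new_flags & WANT_ALL_IPV4) && !(*orig_flags & WANT_ALL_IPV4))
		num_want_all_ipv4++;
	/* switched from flag set to unset */
	else if (!(new_flags & WANT_ALL_IPV4) && (*orig_flags & WANT_ALL_IPV4))
		num_want_all_ipv4--;

	*orig_flags = new_flags;
}

int main(void)
{
	unsigned char flags = 0;

	want_ipv4_update(&flags, WANT_ALL_IPV4);	/* unset -> set: counter becomes 1 */
	want_ipv4_update(&flags, WANT_ALL_IPV4);	/* unchanged: no double counting */
	want_ipv4_update(&flags, 0);			/* set -> unset: counter back to 0 */
	printf("num_want_all_ipv4 = %d\n", num_want_all_ipv4);
	return 0;
}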
// SPDX-License-Identifier: GPL-2.0 #include <linux/capability.h> #include <linux/seq_file.h> #include <linux/uaccess.h> #include <linux/proc_fs.h> #include <linux/ctype.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/init.h> #define LINE_SIZE 80 #include <asm/mtrr.h> #include "mtrr.h" #define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) static const char *const mtrr_strings[MTRR_NUM_TYPES] = { "uncachable", /* 0 */ "write-combining", /* 1 */ "?", /* 2 */ "?", /* 3 */ "write-through", /* 4 */ "write-protect", /* 5 */ "write-back", /* 6 */ }; const char *mtrr_attrib_to_str(int x) { return (x <= 6) ? mtrr_strings[x] : "?"; } #ifdef CONFIG_PROC_FS static int mtrr_file_add(unsigned long base, unsigned long size, unsigned int type, bool increment, struct file *file, int page) { unsigned int *fcount = FILE_FCOUNT(file); int reg, max; max = num_var_ranges; if (fcount == NULL) { fcount = kcalloc(max, sizeof(*fcount), GFP_KERNEL); if (!fcount) return -ENOMEM; FILE_FCOUNT(file) = fcount; } if (!page) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) return -EINVAL; base >>= PAGE_SHIFT; size >>= PAGE_SHIFT; } reg = mtrr_add_page(base, size, type, true); if (reg >= 0) ++fcount[reg]; return reg; } static int mtrr_file_del(unsigned long base, unsigned long size, struct file *file, int page) { unsigned int *fcount = FILE_FCOUNT(file); int reg; if (!page) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) return -EINVAL; base >>= PAGE_SHIFT; size >>= PAGE_SHIFT; } reg = mtrr_del_page(-1, base, size); if (reg < 0) return reg; if (fcount == NULL) return reg; if (fcount[reg] < 1) return -EINVAL; --fcount[reg]; return reg; } /* * seq_file can seek but we ignore it. 
* * Format of control line: * "base=%Lx size=%Lx type=%s" or "disable=%d" */ static ssize_t mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) { int i, err; unsigned long reg; unsigned long long base, size; char *ptr; char line[LINE_SIZE]; int length; memset(line, 0, LINE_SIZE); len = min_t(size_t, len, LINE_SIZE - 1); length = strncpy_from_user(line, buf, len); if (length < 0) return length; ptr = line + length - 1; if (length && *ptr == '\n') *ptr = '\0'; if (!strncmp(line, "disable=", 8)) { reg = simple_strtoul(line + 8, &ptr, 0); err = mtrr_del_page(reg, 0, 0); if (err < 0) return err; return len; } if (strncmp(line, "base=", 5)) return -EINVAL; base = simple_strtoull(line + 5, &ptr, 0); ptr = skip_spaces(ptr); if (strncmp(ptr, "size=", 5)) return -EINVAL; size = simple_strtoull(ptr + 5, &ptr, 0); if ((base & 0xfff) || (size & 0xfff)) return -EINVAL; ptr = skip_spaces(ptr); if (strncmp(ptr, "type=", 5)) return -EINVAL; ptr = skip_spaces(ptr + 5); i = match_string(mtrr_strings, MTRR_NUM_TYPES, ptr); if (i < 0) return i; base >>= PAGE_SHIFT; size >>= PAGE_SHIFT; err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true); if (err < 0) return err; return len; } static long mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) { int err = 0; mtrr_type type; unsigned long base; unsigned long size; struct mtrr_sentry sentry; struct mtrr_gentry gentry; void __user *arg = (void __user *) __arg; memset(&gentry, 0, sizeof(gentry)); switch (cmd) { case MTRRIOC_ADD_ENTRY: case MTRRIOC_SET_ENTRY: case MTRRIOC_DEL_ENTRY: case MTRRIOC_KILL_ENTRY: case MTRRIOC_ADD_PAGE_ENTRY: case MTRRIOC_SET_PAGE_ENTRY: case MTRRIOC_DEL_PAGE_ENTRY: case MTRRIOC_KILL_PAGE_ENTRY: if (copy_from_user(&sentry, arg, sizeof(sentry))) return -EFAULT; break; case MTRRIOC_GET_ENTRY: case MTRRIOC_GET_PAGE_ENTRY: if (copy_from_user(&gentry, arg, sizeof(gentry))) return -EFAULT; break; #ifdef CONFIG_COMPAT case MTRRIOC32_ADD_ENTRY: case MTRRIOC32_SET_ENTRY: case MTRRIOC32_DEL_ENTRY: case MTRRIOC32_KILL_ENTRY: case MTRRIOC32_ADD_PAGE_ENTRY: case MTRRIOC32_SET_PAGE_ENTRY: case MTRRIOC32_DEL_PAGE_ENTRY: case MTRRIOC32_KILL_PAGE_ENTRY: { struct mtrr_sentry32 __user *s32; s32 = (struct mtrr_sentry32 __user *)__arg; err = get_user(sentry.base, &s32->base); err |= get_user(sentry.size, &s32->size); err |= get_user(sentry.type, &s32->type); if (err) return err; break; } case MTRRIOC32_GET_ENTRY: case MTRRIOC32_GET_PAGE_ENTRY: { struct mtrr_gentry32 __user *g32; g32 = (struct mtrr_gentry32 __user *)__arg; err = get_user(gentry.regnum, &g32->regnum); err |= get_user(gentry.base, &g32->base); err |= get_user(gentry.size, &g32->size); err |= get_user(gentry.type, &g32->type); if (err) return err; break; } #endif } switch (cmd) { default: return -ENOTTY; case MTRRIOC_ADD_ENTRY: #ifdef CONFIG_COMPAT case MTRRIOC32_ADD_ENTRY: #endif err = mtrr_file_add(sentry.base, sentry.size, sentry.type, true, file, 0); break; case MTRRIOC_SET_ENTRY: #ifdef CONFIG_COMPAT case MTRRIOC32_SET_ENTRY: #endif err = mtrr_add(sentry.base, sentry.size, sentry.type, false); break; case MTRRIOC_DEL_ENTRY: #ifdef CONFIG_COMPAT case MTRRIOC32_DEL_ENTRY: #endif err = mtrr_file_del(sentry.base, sentry.size, file, 0); break; case MTRRIOC_KILL_ENTRY: #ifdef CONFIG_COMPAT case MTRRIOC32_KILL_ENTRY: #endif err = mtrr_del(-1, sentry.base, sentry.size); break; case MTRRIOC_GET_ENTRY: #ifdef CONFIG_COMPAT case MTRRIOC32_GET_ENTRY: #endif if (gentry.regnum >= num_var_ranges) return -EINVAL; mtrr_if->get(gentry.regnum, &base, 
&size, &type); /* Hide entries that go above 4GB */ if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)) || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))) gentry.base = gentry.size = gentry.type = 0; else { gentry.base = base << PAGE_SHIFT; gentry.size = size << PAGE_SHIFT; gentry.type = type; } break; case MTRRIOC_ADD_PAGE_ENTRY: #ifdef CONFIG_COMPAT case MTRRIOC32_ADD_PAGE_ENTRY: #endif err = mtrr_file_add(sentry.base, sentry.size, sentry.type, true, file, 1); break; case MTRRIOC_SET_PAGE_ENTRY: #ifdef CONFIG_COMPAT case MTRRIOC32_SET_PAGE_ENTRY: #endif err = mtrr_add_page(sentry.base, sentry.size, sentry.type, false); break; case MTRRIOC_DEL_PAGE_ENTRY: #ifdef CONFIG_COMPAT case MTRRIOC32_DEL_PAGE_ENTRY: #endif err = mtrr_file_del(sentry.base, sentry.size, file, 1); break; case MTRRIOC_KILL_PAGE_ENTRY: #ifdef CONFIG_COMPAT case MTRRIOC32_KILL_PAGE_ENTRY: #endif err = mtrr_del_page(-1, sentry.base, sentry.size); break; case MTRRIOC_GET_PAGE_ENTRY: #ifdef CONFIG_COMPAT case MTRRIOC32_GET_PAGE_ENTRY: #endif if (gentry.regnum >= num_var_ranges) return -EINVAL; mtrr_if->get(gentry.regnum, &base, &size, &type); /* Hide entries that would overflow */ if (size != (__typeof__(gentry.size))size) gentry.base = gentry.size = gentry.type = 0; else { gentry.base = base; gentry.size = size; gentry.type = type; } break; } if (err) return err; switch (cmd) { case MTRRIOC_GET_ENTRY: case MTRRIOC_GET_PAGE_ENTRY: if (copy_to_user(arg, &gentry, sizeof(gentry))) err = -EFAULT; break; #ifdef CONFIG_COMPAT case MTRRIOC32_GET_ENTRY: case MTRRIOC32_GET_PAGE_ENTRY: { struct mtrr_gentry32 __user *g32; g32 = (struct mtrr_gentry32 __user *)__arg; err = put_user(gentry.base, &g32->base); err |= put_user(gentry.size, &g32->size); err |= put_user(gentry.regnum, &g32->regnum); err |= put_user(gentry.type, &g32->type); break; } #endif } return err; } static int mtrr_close(struct inode *ino, struct file *file) { unsigned int *fcount = FILE_FCOUNT(file); int i, max; if (fcount != NULL) { max = num_var_ranges; for (i = 0; i < max; ++i) { while (fcount[i] > 0) { mtrr_del(i, 0, 0); --fcount[i]; } } kfree(fcount); FILE_FCOUNT(file) = NULL; } return single_release(ino, file); } static int mtrr_seq_show(struct seq_file *seq, void *offset) { char factor; int i, max; mtrr_type type; unsigned long base, size; max = num_var_ranges; for (i = 0; i < max; i++) { mtrr_if->get(i, &base, &size, &type); if (size == 0) { mtrr_usage_table[i] = 0; continue; } if (size < (0x100000 >> PAGE_SHIFT)) { /* less than 1MB */ factor = 'K'; size <<= PAGE_SHIFT - 10; } else { factor = 'M'; size >>= 20 - PAGE_SHIFT; } /* Base can be > 32bit */ seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n", i, base, base >> (20 - PAGE_SHIFT), size, factor, mtrr_usage_table[i], mtrr_attrib_to_str(type)); } return 0; } static int mtrr_open(struct inode *inode, struct file *file) { if (!mtrr_if) return -EIO; if (!mtrr_if->get) return -ENXIO; if (!capable(CAP_SYS_ADMIN)) return -EPERM; return single_open(file, mtrr_seq_show, NULL); } static const struct proc_ops mtrr_proc_ops = { .proc_open = mtrr_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_write = mtrr_write, .proc_ioctl = mtrr_ioctl, #ifdef CONFIG_COMPAT .proc_compat_ioctl = mtrr_ioctl, #endif .proc_release = mtrr_close, }; static int __init mtrr_if_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; if ((!cpu_has(c, X86_FEATURE_MTRR)) && (!cpu_has(c, X86_FEATURE_K6_MTRR)) && (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) && (!cpu_has(c, 
X86_FEATURE_CENTAUR_MCR))) return -ENODEV; proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_proc_ops); return 0; } arch_initcall(mtrr_if_init); #endif /* CONFIG_PROC_FS */
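mtrr_write() above accepts control lines of the form "base=%Lx size=%Lx type=%s" (with type being one of the mtrr_strings entries) or "disable=%d", and rejects base/size values that are not 4 KiB aligned. The following is a minimal userspace sketch of that interface; the base and size values are purely illustrative and the program needs root plus procfs/MTRR support on the running kernel.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* write-combining is index 1 in mtrr_strings; base/size are 4 KiB aligned */
	const char *cmd = "base=0xd0000000 size=0x1000000 type=write-combining\n";
	char buf[4096];
	ssize_t n;
	int fd;

	fd = open("/proc/mtrr", O_RDWR);
	if (fd < 0) {
		perror("open /proc/mtrr");
		return 1;
	}
	if (write(fd, cmd, strlen(cmd)) < 0)
		perror("write");	/* e.g. EINVAL if base/size are unaligned */

	/* read the table back; mtrr_seq_show() prints one "regNN: ..." line per range */
	lseek(fd, 0, SEEK_SET);
	while ((n = read(fd, buf, sizeof(buf) - 1)) > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	}
	close(fd);
	return 0;
}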
// SPDX-License-Identifier: GPL-2.0+ /* * devices.c * (C) Copyright 1999 Randy Dunlap. * (C) Copyright 1999,2000 Thomas Sailer <sailer@ife.ee.ethz.ch>. * (proc file per device) * (C) Copyright 1999 Deti Fliegl (new USB architecture) * ************************************************************* * * <mountpoint>/devices contains USB topology, device, config, class, * interface, & endpoint data. * * I considered using /dev/bus/usb/device# for each device * as it is attached or detached, but I didn't like this for some * reason -- maybe it's just too deep of a directory structure. * I also don't like looking in multiple places to gather and view * the data. Having only one file for ./devices also prevents race * conditions that could arise if a program was reading device info * for devices that are being removed (unplugged). (That is, the * program may find a directory for devnum_12 then try to open it, * but it was just unplugged, so the directory is now deleted. * But programs would just have to be prepared for situations like * this in any plug-and-play environment.) * * 1999-12-16: Thomas Sailer <sailer@ife.ee.ethz.ch> * Converted the whole proc stuff to real * read methods. Now not the whole device list needs to fit * into one page, only the device list for one bus. 
* Added a poll method to /sys/kernel/debug/usb/devices, to wake * up an eventual usbd * 2000-01-04: Thomas Sailer <sailer@ife.ee.ethz.ch> * Turned into its own filesystem * 2000-07-05: Ashley Montanaro <ashley@compsoc.man.ac.uk> * Converted file reading routine to dump to buffer once * per device, not per bus */ #include <linux/fs.h> #include <linux/mm.h> #include <linux/gfp.h> #include <linux/usb.h> #include <linux/usbdevice_fs.h> #include <linux/usb/hcd.h> #include <linux/mutex.h> #include <linux/uaccess.h> #include "usb.h" /* Define ALLOW_SERIAL_NUMBER if you want to see the serial number of devices */ #define ALLOW_SERIAL_NUMBER static const char format_topo[] = /* T: Bus=dd Lev=dd Prnt=dd Port=dd Cnt=dd Dev#=ddd Spd=dddd MxCh=dd */ "\nT: Bus=%2.2d Lev=%2.2d Prnt=%2.2d Port=%2.2d Cnt=%2.2d Dev#=%3d Spd=%-4s MxCh=%2d\n"; static const char format_string_manufacturer[] = /* S: Manufacturer=xxxx */ "S: Manufacturer=%.100s\n"; static const char format_string_product[] = /* S: Product=xxxx */ "S: Product=%.100s\n"; #ifdef ALLOW_SERIAL_NUMBER static const char format_string_serialnumber[] = /* S: SerialNumber=xxxx */ "S: SerialNumber=%.100s\n"; #endif static const char format_bandwidth[] = /* B: Alloc=ddd/ddd us (xx%), #Int=ddd, #Iso=ddd */ "B: Alloc=%3d/%3d us (%2d%%), #Int=%3d, #Iso=%3d\n"; static const char format_device1[] = /* D: Ver=xx.xx Cls=xx(sssss) Sub=xx Prot=xx MxPS=dd #Cfgs=dd */ "D: Ver=%2x.%02x Cls=%02x(%-5s) Sub=%02x Prot=%02x MxPS=%2d #Cfgs=%3d\n"; static const char format_device2[] = /* P: Vendor=xxxx ProdID=xxxx Rev=xx.xx */ "P: Vendor=%04x ProdID=%04x Rev=%2x.%02x\n"; static const char format_config[] = /* C: #Ifs=dd Cfg#=dd Atr=xx MPwr=dddmA */ "C:%c #Ifs=%2d Cfg#=%2d Atr=%02x MxPwr=%3dmA\n"; static const char format_iad[] = /* A: FirstIf#=dd IfCount=dd Cls=xx(sssss) Sub=xx Prot=xx */ "A: FirstIf#=%2d IfCount=%2d Cls=%02x(%-5s) Sub=%02x Prot=%02x\n"; static const char format_iface[] = /* I: If#=dd Alt=dd #EPs=dd Cls=xx(sssss) Sub=xx Prot=xx Driver=xxxx*/ "I:%c If#=%2d Alt=%2d #EPs=%2d Cls=%02x(%-5s) Sub=%02x Prot=%02x Driver=%s\n"; static const char format_endpt[] = /* E: Ad=xx(s) Atr=xx(ssss) MxPS=dddd Ivl=D?s */ "E: Ad=%02x(%c) Atr=%02x(%-4s) MxPS=%4d Ivl=%d%cs\n"; struct class_info { int class; char *class_name; }; static const struct class_info clas_info[] = { /* max. 5 chars. per name string */ {USB_CLASS_PER_INTERFACE, ">ifc"}, {USB_CLASS_AUDIO, "audio"}, {USB_CLASS_COMM, "comm."}, {USB_CLASS_HID, "HID"}, {USB_CLASS_PHYSICAL, "PID"}, {USB_CLASS_STILL_IMAGE, "still"}, {USB_CLASS_PRINTER, "print"}, {USB_CLASS_MASS_STORAGE, "stor."}, {USB_CLASS_HUB, "hub"}, {USB_CLASS_CDC_DATA, "data"}, {USB_CLASS_CSCID, "scard"}, {USB_CLASS_CONTENT_SEC, "c-sec"}, {USB_CLASS_VIDEO, "video"}, {USB_CLASS_PERSONAL_HEALTHCARE, "perhc"}, {USB_CLASS_AUDIO_VIDEO, "av"}, {USB_CLASS_BILLBOARD, "blbrd"}, {USB_CLASS_USB_TYPE_C_BRIDGE, "bridg"}, {USB_CLASS_WIRELESS_CONTROLLER, "wlcon"}, {USB_CLASS_MISC, "misc"}, {USB_CLASS_APP_SPEC, "app."}, {USB_CLASS_VENDOR_SPEC, "vend."}, {-1, "unk."} /* leave as last */ }; /*****************************************************************/ static const char *class_decode(const int class) { int ix; for (ix = 0; clas_info[ix].class != -1; ix++) if (clas_info[ix].class == class) break; return clas_info[ix].class_name; } static char *usb_dump_endpoint_descriptor(int speed, char *start, char *end, const struct usb_endpoint_descriptor *desc) { char dir, unit, *type; unsigned interval, bandwidth = 1; if (start > end) return start; dir = usb_endpoint_dir_in(desc) ? 
'I' : 'O'; if (speed == USB_SPEED_HIGH) bandwidth = usb_endpoint_maxp_mult(desc); /* this isn't checking for illegal values */ switch (usb_endpoint_type(desc)) { case USB_ENDPOINT_XFER_CONTROL: type = "Ctrl"; dir = 'B'; /* ctrl is bidirectional */ break; case USB_ENDPOINT_XFER_ISOC: type = "Isoc"; break; case USB_ENDPOINT_XFER_BULK: type = "Bulk"; break; case USB_ENDPOINT_XFER_INT: type = "Int."; break; default: /* "can't happen" */ return start; } interval = usb_decode_interval(desc, speed); if (interval % 1000) { unit = 'u'; } else { unit = 'm'; interval /= 1000; } start += sprintf(start, format_endpt, desc->bEndpointAddress, dir, desc->bmAttributes, type, usb_endpoint_maxp(desc) * bandwidth, interval, unit); return start; } static char *usb_dump_interface_descriptor(char *start, char *end, const struct usb_interface_cache *intfc, const struct usb_interface *iface, int setno) { const struct usb_interface_descriptor *desc; const char *driver_name = ""; int active = 0; if (start > end) return start; desc = &intfc->altsetting[setno].desc; if (iface) { driver_name = (iface->dev.driver ? iface->dev.driver->name : "(none)"); active = (desc == &iface->cur_altsetting->desc); } start += sprintf(start, format_iface, active ? '*' : ' ', /* mark active altsetting */ desc->bInterfaceNumber, desc->bAlternateSetting, desc->bNumEndpoints, desc->bInterfaceClass, class_decode(desc->bInterfaceClass), desc->bInterfaceSubClass, desc->bInterfaceProtocol, driver_name); return start; } static char *usb_dump_interface(int speed, char *start, char *end, const struct usb_interface_cache *intfc, const struct usb_interface *iface, int setno) { const struct usb_host_interface *desc = &intfc->altsetting[setno]; int i; start = usb_dump_interface_descriptor(start, end, intfc, iface, setno); for (i = 0; i < desc->desc.bNumEndpoints; i++) { start = usb_dump_endpoint_descriptor(speed, start, end, &desc->endpoint[i].desc); } return start; } static char *usb_dump_iad_descriptor(char *start, char *end, const struct usb_interface_assoc_descriptor *iad) { if (start > end) return start; start += sprintf(start, format_iad, iad->bFirstInterface, iad->bInterfaceCount, iad->bFunctionClass, class_decode(iad->bFunctionClass), iad->bFunctionSubClass, iad->bFunctionProtocol); return start; } /* TBD: * 0. TBDs * 1. marking active interface altsettings (code lists all, but should mark * which ones are active, if any) */ static char *usb_dump_config_descriptor(char *start, char *end, const struct usb_config_descriptor *desc, int active, int speed) { int mul; if (start > end) return start; if (speed >= USB_SPEED_SUPER) mul = 8; else mul = 2; start += sprintf(start, format_config, /* mark active/actual/current cfg. */ active ? '*' : ' ', desc->bNumInterfaces, desc->bConfigurationValue, desc->bmAttributes, desc->bMaxPower * mul); return start; } static char *usb_dump_config(int speed, char *start, char *end, const struct usb_host_config *config, int active) { int i, j; struct usb_interface_cache *intfc; struct usb_interface *interface; if (start > end) return start; if (!config) /* getting these some in 2.3.7; none in 2.3.6 */ return start + sprintf(start, "(null Cfg. 
desc.)\n"); start = usb_dump_config_descriptor(start, end, &config->desc, active, speed); for (i = 0; i < USB_MAXIADS; i++) { if (config->intf_assoc[i] == NULL) break; start = usb_dump_iad_descriptor(start, end, config->intf_assoc[i]); } for (i = 0; i < config->desc.bNumInterfaces; i++) { intfc = config->intf_cache[i]; interface = config->interface[i]; for (j = 0; j < intfc->num_altsetting; j++) { start = usb_dump_interface(speed, start, end, intfc, interface, j); } } return start; } /* * Dump the different USB descriptors. */ static char *usb_dump_device_descriptor(char *start, char *end, const struct usb_device_descriptor *desc) { u16 bcdUSB = le16_to_cpu(desc->bcdUSB); u16 bcdDevice = le16_to_cpu(desc->bcdDevice); if (start > end) return start; start += sprintf(start, format_device1, bcdUSB >> 8, bcdUSB & 0xff, desc->bDeviceClass, class_decode(desc->bDeviceClass), desc->bDeviceSubClass, desc->bDeviceProtocol, desc->bMaxPacketSize0, desc->bNumConfigurations); if (start > end) return start; start += sprintf(start, format_device2, le16_to_cpu(desc->idVendor), le16_to_cpu(desc->idProduct), bcdDevice >> 8, bcdDevice & 0xff); return start; } /* * Dump the different strings that this device holds. */ static char *usb_dump_device_strings(char *start, char *end, struct usb_device *dev) { if (start > end) return start; if (dev->manufacturer) start += sprintf(start, format_string_manufacturer, dev->manufacturer); if (start > end) goto out; if (dev->product) start += sprintf(start, format_string_product, dev->product); if (start > end) goto out; #ifdef ALLOW_SERIAL_NUMBER if (dev->serial) start += sprintf(start, format_string_serialnumber, dev->serial); #endif out: return start; } static char *usb_dump_desc(char *start, char *end, struct usb_device *dev) { int i; start = usb_dump_device_descriptor(start, end, &dev->descriptor); start = usb_dump_device_strings(start, end, dev); for (i = 0; i < dev->descriptor.bNumConfigurations; i++) { start = usb_dump_config(dev->speed, start, end, dev->config + i, /* active ? */ (dev->config + i) == dev->actconfig); } return start; } /*****************************************************************/ /* This is a recursive function. Parameters: * buffer - the user-space buffer to write data into * nbytes - the maximum number of bytes to write * skip_bytes - the number of bytes to skip before writing anything * file_offset - the offset into the devices file on completion * The caller must own the device lock. */ static ssize_t usb_device_dump(char __user **buffer, size_t *nbytes, loff_t *skip_bytes, loff_t *file_offset, struct usb_device *usbdev, struct usb_bus *bus, int level, int index, int count) { int chix; int ret, cnt = 0; int parent_devnum = 0; char *pages_start, *data_end, *speed; unsigned int length; ssize_t total_written = 0; struct usb_device *childdev = NULL; /* don't bother with anything else if we're not writing any data */ if (*nbytes <= 0) return 0; if (level > MAX_TOPO_LEVEL) return 0; /* allocate 2^1 pages = 8K (on i386); * should be more than enough for one device */ pages_start = (char *)__get_free_pages(GFP_NOIO, 1); if (!pages_start) return -ENOMEM; if (usbdev->parent && usbdev->parent->devnum != -1) parent_devnum = usbdev->parent->devnum; /* * So the root hub's parent is 0 and any device that is * plugged into the root hub has a parent of 0. 
*/ switch (usbdev->speed) { case USB_SPEED_LOW: speed = "1.5"; break; case USB_SPEED_UNKNOWN: /* usb 1.1 root hub code */ case USB_SPEED_FULL: speed = "12"; break; case USB_SPEED_HIGH: speed = "480"; break; case USB_SPEED_SUPER: speed = "5000"; break; case USB_SPEED_SUPER_PLUS: speed = "10000"; break; default: speed = "??"; } data_end = pages_start + sprintf(pages_start, format_topo, bus->busnum, level, parent_devnum, index, count, usbdev->devnum, speed, usbdev->maxchild); /* * level = topology-tier level; * parent_devnum = parent device number; * index = parent's connector number; * count = device count at this level */ /* If this is the root hub, display the bandwidth information */ if (level == 0) { int max; /* super/high speed reserves 80%, full/low reserves 90% */ if (usbdev->speed == USB_SPEED_HIGH || usbdev->speed >= USB_SPEED_SUPER) max = 800; else max = FRAME_TIME_MAX_USECS_ALLOC; /* report "average" periodic allocation over a microsecond. * the schedules are actually bursty, HCDs need to deal with * that and just compute/report this average. */ data_end += sprintf(data_end, format_bandwidth, bus->bandwidth_allocated, max, (100 * bus->bandwidth_allocated + max / 2) / max, bus->bandwidth_int_reqs, bus->bandwidth_isoc_reqs); } data_end = usb_dump_desc(data_end, pages_start + (2 * PAGE_SIZE) - 256, usbdev); if (data_end > (pages_start + (2 * PAGE_SIZE) - 256)) data_end += sprintf(data_end, "(truncated)\n"); length = data_end - pages_start; /* if we can start copying some data to the user */ if (length > *skip_bytes) { length -= *skip_bytes; if (length > *nbytes) length = *nbytes; if (copy_to_user(*buffer, pages_start + *skip_bytes, length)) { free_pages((unsigned long)pages_start, 1); return -EFAULT; } *nbytes -= length; *file_offset += length; total_written += length; *buffer += length; *skip_bytes = 0; } else *skip_bytes -= length; free_pages((unsigned long)pages_start, 1); /* Now look at all of this device's children. */ usb_hub_for_each_child(usbdev, chix, childdev) { usb_lock_device(childdev); ret = usb_device_dump(buffer, nbytes, skip_bytes, file_offset, childdev, bus, level + 1, chix - 1, ++cnt); usb_unlock_device(childdev); if (ret == -EFAULT) return total_written; total_written += ret; } return total_written; } static ssize_t usb_device_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { struct usb_bus *bus; ssize_t ret, total_written = 0; loff_t skip_bytes = *ppos; int id; if (*ppos < 0) return -EINVAL; if (nbytes <= 0) return 0; mutex_lock(&usb_bus_idr_lock); /* print devices for all busses */ idr_for_each_entry(&usb_bus_idr, bus, id) { /* recurse through all children of the root hub */ if (!bus_to_hcd(bus)->rh_registered) continue; usb_lock_device(bus->root_hub); ret = usb_device_dump(&buf, &nbytes, &skip_bytes, ppos, bus->root_hub, bus, 0, 0, 0); usb_unlock_device(bus->root_hub); if (ret < 0) { mutex_unlock(&usb_bus_idr_lock); return ret; } total_written += ret; } mutex_unlock(&usb_bus_idr_lock); return total_written; } const struct file_operations usbfs_devices_fops = { .llseek = no_seek_end_llseek, .read = usb_device_read, };
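usb_device_read() above walks every registered bus and dumps one block per device, using the line prefixes defined by the format_* strings (T: topology, D:/P: device descriptor, S: strings, C: configuration, A: interface association, I: interface, E: endpoint). Below is a small userspace sketch that prints just the topology and interface lines; the path matches the debugfs location mentioned in the header comment and assumes debugfs is mounted at /sys/kernel/debug.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/usb/devices", "r");
	char line[256];

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* T: topology lines, I: interface lines (active altsetting marked '*') */
		if (line[0] == 'T' || line[0] == 'I')
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}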
// SPDX-License-Identifier: GPL-2.0 /* * linux/drivers/scsi/scsi_proc.c * * The functions in this file provide an interface between * the PROC file system and the SCSI device drivers * It is mainly used for debugging, statistics and to pass * information directly to the lowlevel driver. * * (c) 1995 Michael Neuffer neuffer@goofy.zdv.uni-mainz.de * Version: 0.99.8 last change: 95/09/13 * * generic command parser provided by: * Andreas Heilwagen <crashcar@informatik.uni-koblenz.de> * * generic_proc_info() support of xxxx_info() by: * Michael A. 
Griffith <grif@acm.org> */ #include <linux/module.h> #include <linux/init.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/proc_fs.h> #include <linux/errno.h> #include <linux/blkdev.h> #include <linux/seq_file.h> #include <linux/mutex.h> #include <linux/gfp.h> #include <linux/uaccess.h> #include <scsi/scsi.h> #include <scsi/scsi_device.h> #include <scsi/scsi_host.h> #include <scsi/scsi_transport.h> #include "scsi_priv.h" #include "scsi_logging.h" /* 4K page size, but our output routines, use some slack for overruns */ #define PROC_BLOCK_SIZE (3*1024) static struct proc_dir_entry *proc_scsi; /* Protects scsi_proc_list */ static DEFINE_MUTEX(global_host_template_mutex); static LIST_HEAD(scsi_proc_list); /** * struct scsi_proc_entry - (host template, SCSI proc dir) association * @entry: entry in scsi_proc_list. * @sht: SCSI host template associated with the procfs directory. * @proc_dir: procfs directory associated with the SCSI host template. * @present: Number of SCSI hosts instantiated for @sht. */ struct scsi_proc_entry { struct list_head entry; const struct scsi_host_template *sht; struct proc_dir_entry *proc_dir; unsigned int present; }; static ssize_t proc_scsi_host_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct Scsi_Host *shost = pde_data(file_inode(file)); ssize_t ret = -ENOMEM; char *page; if (count > PROC_BLOCK_SIZE) return -EOVERFLOW; if (!shost->hostt->write_info) return -EINVAL; page = (char *)__get_free_page(GFP_KERNEL); if (page) { ret = -EFAULT; if (copy_from_user(page, buf, count)) goto out; ret = shost->hostt->write_info(shost, page, count); } out: free_page((unsigned long)page); return ret; } static int proc_scsi_show(struct seq_file *m, void *v) { struct Scsi_Host *shost = m->private; return shost->hostt->show_info(m, shost); } static int proc_scsi_host_open(struct inode *inode, struct file *file) { return single_open_size(file, proc_scsi_show, pde_data(inode), 4 * PAGE_SIZE); } static struct scsi_proc_entry * __scsi_lookup_proc_entry(const struct scsi_host_template *sht) { struct scsi_proc_entry *e; lockdep_assert_held(&global_host_template_mutex); list_for_each_entry(e, &scsi_proc_list, entry) if (e->sht == sht) return e; return NULL; } static struct scsi_proc_entry * scsi_lookup_proc_entry(const struct scsi_host_template *sht) { struct scsi_proc_entry *e; mutex_lock(&global_host_template_mutex); e = __scsi_lookup_proc_entry(sht); mutex_unlock(&global_host_template_mutex); return e; } /** * scsi_template_proc_dir() - returns the procfs dir for a SCSI host template * @sht: SCSI host template pointer. */ struct proc_dir_entry * scsi_template_proc_dir(const struct scsi_host_template *sht) { struct scsi_proc_entry *e = scsi_lookup_proc_entry(sht); return e ? e->proc_dir : NULL; } EXPORT_SYMBOL_GPL(scsi_template_proc_dir); static const struct proc_ops proc_scsi_ops = { .proc_open = proc_scsi_host_open, .proc_release = single_release, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_write = proc_scsi_host_write }; /** * scsi_proc_hostdir_add - Create directory in /proc for a scsi host * @sht: owner of this directory * * Sets sht->proc_dir to the new directory. 
*/ int scsi_proc_hostdir_add(const struct scsi_host_template *sht) { struct scsi_proc_entry *e; int ret; if (!sht->show_info) return 0; mutex_lock(&global_host_template_mutex); e = __scsi_lookup_proc_entry(sht); if (!e) { e = kzalloc(sizeof(*e), GFP_KERNEL); if (!e) { ret = -ENOMEM; goto unlock; } } if (e->present++) goto success; e->proc_dir = proc_mkdir(sht->proc_name, proc_scsi); if (!e->proc_dir) { printk(KERN_ERR "%s: proc_mkdir failed for %s\n", __func__, sht->proc_name); ret = -ENOMEM; goto unlock; } e->sht = sht; list_add_tail(&e->entry, &scsi_proc_list); success: e = NULL; ret = 0; unlock: mutex_unlock(&global_host_template_mutex); kfree(e); return ret; } /** * scsi_proc_hostdir_rm - remove directory in /proc for a scsi host * @sht: owner of directory */ void scsi_proc_hostdir_rm(const struct scsi_host_template *sht) { struct scsi_proc_entry *e; if (!sht->show_info) return; mutex_lock(&global_host_template_mutex); e = __scsi_lookup_proc_entry(sht); if (e && !--e->present) { remove_proc_entry(sht->proc_name, proc_scsi); list_del(&e->entry); kfree(e); } mutex_unlock(&global_host_template_mutex); } /** * scsi_proc_host_add - Add entry for this host to appropriate /proc dir * @shost: host to add */ void scsi_proc_host_add(struct Scsi_Host *shost) { const struct scsi_host_template *sht = shost->hostt; struct scsi_proc_entry *e; struct proc_dir_entry *p; char name[10]; if (!sht->show_info) return; e = scsi_lookup_proc_entry(sht); if (!e) goto err; sprintf(name,"%d", shost->host_no); p = proc_create_data(name, S_IRUGO | S_IWUSR, e->proc_dir, &proc_scsi_ops, shost); if (!p) goto err; return; err: shost_printk(KERN_ERR, shost, "%s: Failed to register host (%s failed)\n", __func__, e ? "proc_create_data()" : "scsi_proc_hostdir_add()"); } /** * scsi_proc_host_rm - remove this host's entry from /proc * @shost: which host */ void scsi_proc_host_rm(struct Scsi_Host *shost) { const struct scsi_host_template *sht = shost->hostt; struct scsi_proc_entry *e; char name[10]; if (!sht->show_info) return; e = scsi_lookup_proc_entry(sht); if (!e) return; sprintf(name,"%d", shost->host_no); remove_proc_entry(name, e->proc_dir); } /** * proc_print_scsidevice - return data about this host * @dev: A scsi device * @data: &struct seq_file to output to. * * Description: prints Host, Channel, Id, Lun, Vendor, Model, Rev, Type, * and revision. 
*/ static int proc_print_scsidevice(struct device *dev, void *data) { struct scsi_device *sdev; struct seq_file *s = data; int i; if (!scsi_is_sdev_device(dev)) goto out; sdev = to_scsi_device(dev); seq_printf(s, "Host: scsi%d Channel: %02d Id: %02d Lun: %02llu\n Vendor: ", sdev->host->host_no, sdev->channel, sdev->id, sdev->lun); for (i = 0; i < 8; i++) { if (sdev->vendor[i] >= 0x20) seq_putc(s, sdev->vendor[i]); else seq_putc(s, ' '); } seq_puts(s, " Model: "); for (i = 0; i < 16; i++) { if (sdev->model[i] >= 0x20) seq_putc(s, sdev->model[i]); else seq_putc(s, ' '); } seq_puts(s, " Rev: "); for (i = 0; i < 4; i++) { if (sdev->rev[i] >= 0x20) seq_putc(s, sdev->rev[i]); else seq_putc(s, ' '); } seq_putc(s, '\n'); seq_printf(s, " Type: %s ", scsi_device_type(sdev->type)); seq_printf(s, " ANSI SCSI revision: %02x", sdev->scsi_level - (sdev->scsi_level > 1)); if (sdev->scsi_level == 2) seq_puts(s, " CCS\n"); else seq_putc(s, '\n'); out: return 0; } /** * scsi_add_single_device - Respond to user request to probe for/add device * @host: user-supplied decimal integer * @channel: user-supplied decimal integer * @id: user-supplied decimal integer * @lun: user-supplied decimal integer * * Description: called by writing "scsi add-single-device" to /proc/scsi/scsi. * * does scsi_host_lookup() and either user_scan() if that transport * type supports it, or else scsi_scan_host_selected() * * Note: this seems to be aimed exclusively at SCSI parallel busses. */ static int scsi_add_single_device(uint host, uint channel, uint id, uint lun) { struct Scsi_Host *shost; int error = -ENXIO; shost = scsi_host_lookup(host); if (!shost) return error; if (shost->transportt->user_scan) error = shost->transportt->user_scan(shost, channel, id, lun); else error = scsi_scan_host_selected(shost, channel, id, lun, SCSI_SCAN_MANUAL); scsi_host_put(shost); return error; } /** * scsi_remove_single_device - Respond to user request to remove a device * @host: user-supplied decimal integer * @channel: user-supplied decimal integer * @id: user-supplied decimal integer * @lun: user-supplied decimal integer * * Description: called by writing "scsi remove-single-device" to * /proc/scsi/scsi. Does a scsi_device_lookup() and scsi_remove_device() */ static int scsi_remove_single_device(uint host, uint channel, uint id, uint lun) { struct scsi_device *sdev; struct Scsi_Host *shost; int error = -ENXIO; shost = scsi_host_lookup(host); if (!shost) return error; sdev = scsi_device_lookup(shost, channel, id, lun); if (sdev) { scsi_remove_device(sdev); scsi_device_put(sdev); error = 0; } scsi_host_put(shost); return error; } /** * proc_scsi_write - handle writes to /proc/scsi/scsi * @file: not used * @buf: buffer to write * @length: length of buf, at most PAGE_SIZE * @ppos: not used * * Description: this provides a legacy mechanism to add or remove devices by * Host, Channel, ID, and Lun. To use, * "echo 'scsi add-single-device 0 1 2 3' > /proc/scsi/scsi" or * "echo 'scsi remove-single-device 0 1 2 3' > /proc/scsi/scsi" with * "0 1 2 3" replaced by the Host, Channel, Id, and Lun. * * Note: this seems to be aimed at parallel SCSI. Most modern busses (USB, * SATA, Firewire, Fibre Channel, etc) dynamically assign these values to * provide a unique identifier and nothing more. 
*/ static ssize_t proc_scsi_write(struct file *file, const char __user *buf, size_t length, loff_t *ppos) { int host, channel, id, lun; char *buffer, *end, *p; int err; if (!buf || length > PAGE_SIZE) return -EINVAL; buffer = (char *)__get_free_page(GFP_KERNEL); if (!buffer) return -ENOMEM; err = -EFAULT; if (copy_from_user(buffer, buf, length)) goto out; err = -EINVAL; if (length < PAGE_SIZE) { end = buffer + length; *end = '\0'; } else { end = buffer + PAGE_SIZE - 1; if (*end) goto out; } /* * Usage: echo "scsi add-single-device 0 1 2 3" >/proc/scsi/scsi * with "0 1 2 3" replaced by your "Host Channel Id Lun". */ if (!strncmp("scsi add-single-device", buffer, 22)) { p = buffer + 23; host = (p < end) ? simple_strtoul(p, &p, 0) : 0; channel = (p + 1 < end) ? simple_strtoul(p + 1, &p, 0) : 0; id = (p + 1 < end) ? simple_strtoul(p + 1, &p, 0) : 0; lun = (p + 1 < end) ? simple_strtoul(p + 1, &p, 0) : 0; err = scsi_add_single_device(host, channel, id, lun); /* * Usage: echo "scsi remove-single-device 0 1 2 3" >/proc/scsi/scsi * with "0 1 2 3" replaced by your "Host Channel Id Lun". */ } else if (!strncmp("scsi remove-single-device", buffer, 25)) { p = buffer + 26; host = (p < end) ? simple_strtoul(p, &p, 0) : 0; channel = (p + 1 < end) ? simple_strtoul(p + 1, &p, 0) : 0; id = (p + 1 < end) ? simple_strtoul(p + 1, &p, 0) : 0; lun = (p + 1 < end) ? simple_strtoul(p + 1, &p, 0) : 0; err = scsi_remove_single_device(host, channel, id, lun); } /* * convert success returns so that we return the * number of bytes consumed. */ if (!err) err = length; out: free_page((unsigned long)buffer); return err; } static inline struct device *next_scsi_device(struct device *start) { struct device *next = bus_find_next_device(&scsi_bus_type, start); put_device(start); return next; } static void *scsi_seq_start(struct seq_file *sfile, loff_t *pos) { struct device *dev = NULL; loff_t n = *pos; while ((dev = next_scsi_device(dev))) { if (!n--) break; sfile->private++; } return dev; } static void *scsi_seq_next(struct seq_file *sfile, void *v, loff_t *pos) { (*pos)++; sfile->private++; return next_scsi_device(v); } static void scsi_seq_stop(struct seq_file *sfile, void *v) { put_device(v); } static int scsi_seq_show(struct seq_file *sfile, void *dev) { if (!sfile->private) seq_puts(sfile, "Attached devices:\n"); return proc_print_scsidevice(dev, sfile); } static const struct seq_operations scsi_seq_ops = { .start = scsi_seq_start, .next = scsi_seq_next, .stop = scsi_seq_stop, .show = scsi_seq_show }; /** * proc_scsi_open - glue function * @inode: not used * @file: passed to single_open() * * Associates proc_scsi_show with this file */ static int proc_scsi_open(struct inode *inode, struct file *file) { /* * We don't really need this for the write case but it doesn't * harm either. */ return seq_open(file, &scsi_seq_ops); } static const struct proc_ops scsi_scsi_proc_ops = { .proc_open = proc_scsi_open, .proc_read = seq_read, .proc_write = proc_scsi_write, .proc_lseek = seq_lseek, .proc_release = seq_release, }; /** * scsi_init_procfs - create scsi and scsi/scsi in procfs */ int __init scsi_init_procfs(void) { struct proc_dir_entry *pde; proc_scsi = proc_mkdir("scsi", NULL); if (!proc_scsi) goto err1; pde = proc_create("scsi/scsi", 0, NULL, &scsi_scsi_proc_ops); if (!pde) goto err2; return 0; err2: remove_proc_entry("scsi", NULL); err1: return -ENOMEM; } /** * scsi_exit_procfs - Remove scsi/scsi and scsi from procfs */ void scsi_exit_procfs(void) { remove_proc_entry("scsi/scsi", NULL); remove_proc_entry("scsi", NULL); }
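The /proc/scsi plumbing above only does anything for hosts whose template supplies show_info()/write_info() and a proc_name. A minimal sketch of the driver side follows; the hook signatures and template fields are taken from the code above, while the example driver name and handler bodies are hypothetical.

static int example_show_info(struct seq_file *m, struct Scsi_Host *shost)
{
	/* Read side: backs /proc/scsi/<proc_name>/<host_no>. */
	seq_printf(m, "example host %d\n", shost->host_no);
	return 0;
}

static int example_write_info(struct Scsi_Host *shost, char *buffer, int length)
{
	/* Write side: buffer holds at most PROC_BLOCK_SIZE bytes. */
	return length;
}

static const struct scsi_host_template example_sht = {
	.name		= "example",
	.proc_name	= "example",	/* directory name under /proc/scsi/ */
	.show_info	= example_show_info,
	.write_info	= example_write_info,
};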
// SPDX-License-Identifier: MIT
/*
 * vgaarb.c: Implements VGA arbitration. For details refer to
 * Documentation/gpu/vgaarbiter.rst
 *
 * (C) Copyright 2005 Benjamin Herrenschmidt <benh@kernel.crashing.org>
 * (C) Copyright 2007 Paulo R. Zanoni <przanoni@gmail.com>
 * (C) Copyright 2007, 2009 Tiago Vignatti <vignatti@freedesktop.org>
 */

#define pr_fmt(fmt) "vgaarb: " fmt

#define vgaarb_dbg(dev, fmt, arg...)
dev_dbg(dev, "vgaarb: " fmt, ##arg) #define vgaarb_info(dev, fmt, arg...) dev_info(dev, "vgaarb: " fmt, ##arg) #define vgaarb_err(dev, fmt, arg...) dev_err(dev, "vgaarb: " fmt, ##arg) #include <linux/module.h> #include <linux/kernel.h> #include <linux/pci.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/list.h> #include <linux/sched/signal.h> #include <linux/wait.h> #include <linux/spinlock.h> #include <linux/poll.h> #include <linux/miscdevice.h> #include <linux/slab.h> #include <linux/screen_info.h> #include <linux/vt.h> #include <linux/console.h> #include <linux/acpi.h> #include <linux/uaccess.h> #include <linux/vgaarb.h> static void vga_arbiter_notify_clients(void); /* * We keep a list of all VGA devices in the system to speed * up the various operations of the arbiter */ struct vga_device { struct list_head list; struct pci_dev *pdev; unsigned int decodes; /* what it decodes */ unsigned int owns; /* what it owns */ unsigned int locks; /* what it locks */ unsigned int io_lock_cnt; /* legacy IO lock count */ unsigned int mem_lock_cnt; /* legacy MEM lock count */ unsigned int io_norm_cnt; /* normal IO count */ unsigned int mem_norm_cnt; /* normal MEM count */ bool bridge_has_one_vga; bool is_firmware_default; /* device selected by firmware */ unsigned int (*set_decode)(struct pci_dev *pdev, bool decode); }; static LIST_HEAD(vga_list); static int vga_count, vga_decode_count; static bool vga_arbiter_used; static DEFINE_SPINLOCK(vga_lock); static DECLARE_WAIT_QUEUE_HEAD(vga_wait_queue); static const char *vga_iostate_to_str(unsigned int iostate) { /* Ignore VGA_RSRC_IO and VGA_RSRC_MEM */ iostate &= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; switch (iostate) { case VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM: return "io+mem"; case VGA_RSRC_LEGACY_IO: return "io"; case VGA_RSRC_LEGACY_MEM: return "mem"; } return "none"; } static int vga_str_to_iostate(char *buf, int str_size, unsigned int *io_state) { /* * In theory, we could hand out locks on IO and MEM separately to * userspace, but this can cause deadlocks. */ if (strncmp(buf, "none", 4) == 0) { *io_state = VGA_RSRC_NONE; return 1; } /* XXX We're not checking the str_size! */ if (strncmp(buf, "io+mem", 6) == 0) goto both; else if (strncmp(buf, "io", 2) == 0) goto both; else if (strncmp(buf, "mem", 3) == 0) goto both; return 0; both: *io_state = VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; return 1; } /* This is only used as a cookie, it should not be dereferenced */ static struct pci_dev *vga_default; /* Find somebody in our list */ static struct vga_device *vgadev_find(struct pci_dev *pdev) { struct vga_device *vgadev; list_for_each_entry(vgadev, &vga_list, list) if (pdev == vgadev->pdev) return vgadev; return NULL; } /** * vga_default_device - return the default VGA device, for vgacon * * This can be defined by the platform. The default implementation is * rather dumb and will probably only work properly on single VGA card * setups and/or x86 platforms. * * If your VGA default device is not PCI, you'll have to return NULL here. * In this case, I assume it will not conflict with any PCI card. If this * is not true, I'll have to define two arch hooks for enabling/disabling * the VGA default device if that is possible. This may be a problem with * real _ISA_ VGA cards, in addition to a PCI one. I don't know at this * point how to deal with that card. Can their IOs be disabled at all? 
If * not, then I suppose it's a matter of having the proper arch hook telling * us about it, so we basically never allow anybody to succeed a vga_get(). */ struct pci_dev *vga_default_device(void) { return vga_default; } EXPORT_SYMBOL_GPL(vga_default_device); void vga_set_default_device(struct pci_dev *pdev) { if (vga_default == pdev) return; pci_dev_put(vga_default); vga_default = pci_dev_get(pdev); } /** * vga_remove_vgacon - deactivate VGA console * * Unbind and unregister vgacon in case pdev is the default VGA device. * Can be called by GPU drivers on initialization to make sure VGA register * access done by vgacon will not disturb the device. * * @pdev: PCI device. */ #if !defined(CONFIG_VGA_CONSOLE) int vga_remove_vgacon(struct pci_dev *pdev) { return 0; } #elif !defined(CONFIG_DUMMY_CONSOLE) int vga_remove_vgacon(struct pci_dev *pdev) { return -ENODEV; } #else int vga_remove_vgacon(struct pci_dev *pdev) { int ret = 0; if (pdev != vga_default) return 0; vgaarb_info(&pdev->dev, "deactivate vga console\n"); console_lock(); if (con_is_bound(&vga_con)) ret = do_take_over_console(&dummy_con, 0, MAX_NR_CONSOLES - 1, 1); if (ret == 0) { ret = do_unregister_con_driver(&vga_con); /* Ignore "already unregistered". */ if (ret == -ENODEV) ret = 0; } console_unlock(); return ret; } #endif EXPORT_SYMBOL(vga_remove_vgacon); /* * If we don't ever use VGA arbitration, we should avoid turning off * anything anywhere due to old X servers getting confused about the boot * device not being VGA. */ static void vga_check_first_use(void) { /* * Inform all GPUs in the system that VGA arbitration has occurred * so they can disable resources if possible. */ if (!vga_arbiter_used) { vga_arbiter_used = true; vga_arbiter_notify_clients(); } } static struct vga_device *__vga_tryget(struct vga_device *vgadev, unsigned int rsrc) { struct device *dev = &vgadev->pdev->dev; unsigned int wants, legacy_wants, match; struct vga_device *conflict; unsigned int pci_bits; u32 flags = 0; /* * Account for "normal" resources to lock. If we decode the legacy, * counterpart, we need to request it as well */ if ((rsrc & VGA_RSRC_NORMAL_IO) && (vgadev->decodes & VGA_RSRC_LEGACY_IO)) rsrc |= VGA_RSRC_LEGACY_IO; if ((rsrc & VGA_RSRC_NORMAL_MEM) && (vgadev->decodes & VGA_RSRC_LEGACY_MEM)) rsrc |= VGA_RSRC_LEGACY_MEM; vgaarb_dbg(dev, "%s: %d\n", __func__, rsrc); vgaarb_dbg(dev, "%s: owns: %d\n", __func__, vgadev->owns); /* Check what resources we need to acquire */ wants = rsrc & ~vgadev->owns; /* We already own everything, just mark locked & bye bye */ if (wants == 0) goto lock_them; /* * We don't need to request a legacy resource, we just enable * appropriate decoding and go. */ legacy_wants = wants & VGA_RSRC_LEGACY_MASK; if (legacy_wants == 0) goto enable_them; /* Ok, we don't, let's find out who we need to kick off */ list_for_each_entry(conflict, &vga_list, list) { unsigned int lwants = legacy_wants; unsigned int change_bridge = 0; /* Don't conflict with myself */ if (vgadev == conflict) continue; /* * We have a possible conflict. Before we go further, we must * check if we sit on the same bus as the conflicting device. * If we don't, then we must tie both IO and MEM resources * together since there is only a single bit controlling * VGA forwarding on P2P bridges. */ if (vgadev->pdev->bus != conflict->pdev->bus) { change_bridge = 1; lwants = VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; } /* * Check if the guy has a lock on the resource. If he does, * return the conflicting entry. 
*/ if (conflict->locks & lwants) return conflict; /* * Ok, now check if it owns the resource we want. We can * lock resources that are not decoded; therefore a device * can own resources it doesn't decode. */ match = lwants & conflict->owns; if (!match) continue; /* * Looks like he doesn't have a lock, we can steal them * from him. */ flags = 0; pci_bits = 0; /* * If we can't control legacy resources via the bridge, we * also need to disable normal decoding. */ if (!conflict->bridge_has_one_vga) { if ((match & conflict->decodes) & VGA_RSRC_LEGACY_MEM) pci_bits |= PCI_COMMAND_MEMORY; if ((match & conflict->decodes) & VGA_RSRC_LEGACY_IO) pci_bits |= PCI_COMMAND_IO; if (pci_bits) flags |= PCI_VGA_STATE_CHANGE_DECODES; } if (change_bridge) flags |= PCI_VGA_STATE_CHANGE_BRIDGE; pci_set_vga_state(conflict->pdev, false, pci_bits, flags); conflict->owns &= ~match; /* If we disabled normal decoding, reflect it in owns */ if (pci_bits & PCI_COMMAND_MEMORY) conflict->owns &= ~VGA_RSRC_NORMAL_MEM; if (pci_bits & PCI_COMMAND_IO) conflict->owns &= ~VGA_RSRC_NORMAL_IO; } enable_them: /* * Ok, we got it, everybody conflicting has been disabled, let's * enable us. Mark any bits in "owns" regardless of whether we * decoded them. We can lock resources we don't decode, therefore * we must track them via "owns". */ flags = 0; pci_bits = 0; if (!vgadev->bridge_has_one_vga) { flags |= PCI_VGA_STATE_CHANGE_DECODES; if (wants & (VGA_RSRC_LEGACY_MEM|VGA_RSRC_NORMAL_MEM)) pci_bits |= PCI_COMMAND_MEMORY; if (wants & (VGA_RSRC_LEGACY_IO|VGA_RSRC_NORMAL_IO)) pci_bits |= PCI_COMMAND_IO; } if (wants & VGA_RSRC_LEGACY_MASK) flags |= PCI_VGA_STATE_CHANGE_BRIDGE; pci_set_vga_state(vgadev->pdev, true, pci_bits, flags); vgadev->owns |= wants; lock_them: vgadev->locks |= (rsrc & VGA_RSRC_LEGACY_MASK); if (rsrc & VGA_RSRC_LEGACY_IO) vgadev->io_lock_cnt++; if (rsrc & VGA_RSRC_LEGACY_MEM) vgadev->mem_lock_cnt++; if (rsrc & VGA_RSRC_NORMAL_IO) vgadev->io_norm_cnt++; if (rsrc & VGA_RSRC_NORMAL_MEM) vgadev->mem_norm_cnt++; return NULL; } static void __vga_put(struct vga_device *vgadev, unsigned int rsrc) { struct device *dev = &vgadev->pdev->dev; unsigned int old_locks = vgadev->locks; vgaarb_dbg(dev, "%s\n", __func__); /* * Update our counters and account for equivalent legacy resources * if we decode them. */ if ((rsrc & VGA_RSRC_NORMAL_IO) && vgadev->io_norm_cnt > 0) { vgadev->io_norm_cnt--; if (vgadev->decodes & VGA_RSRC_LEGACY_IO) rsrc |= VGA_RSRC_LEGACY_IO; } if ((rsrc & VGA_RSRC_NORMAL_MEM) && vgadev->mem_norm_cnt > 0) { vgadev->mem_norm_cnt--; if (vgadev->decodes & VGA_RSRC_LEGACY_MEM) rsrc |= VGA_RSRC_LEGACY_MEM; } if ((rsrc & VGA_RSRC_LEGACY_IO) && vgadev->io_lock_cnt > 0) vgadev->io_lock_cnt--; if ((rsrc & VGA_RSRC_LEGACY_MEM) && vgadev->mem_lock_cnt > 0) vgadev->mem_lock_cnt--; /* * Just clear lock bits, we do lazy operations so we don't really * have to bother about anything else at this point. */ if (vgadev->io_lock_cnt == 0) vgadev->locks &= ~VGA_RSRC_LEGACY_IO; if (vgadev->mem_lock_cnt == 0) vgadev->locks &= ~VGA_RSRC_LEGACY_MEM; /* * Kick the wait queue in case somebody was waiting if we actually * released something. */ if (old_locks != vgadev->locks) wake_up_all(&vga_wait_queue); } /** * vga_get - acquire & lock VGA resources * @pdev: PCI device of the VGA card or NULL for the system default * @rsrc: bit mask of resources to acquire and lock * @interruptible: blocking should be interruptible by signals ? * * Acquire VGA resources for the given card and mark those resources * locked. 
If the resources requested are "normal" (and not legacy) * resources, the arbiter will first check whether the card is doing legacy * decoding for that type of resource. If yes, the lock is "converted" into * a legacy resource lock. * * The arbiter will first look for all VGA cards that might conflict and disable * their IOs and/or Memory access, including VGA forwarding on P2P bridges if * necessary, so that the requested resources can be used. Then, the card is * marked as locking these resources and the IO and/or Memory accesses are * enabled on the card (including VGA forwarding on parent P2P bridges if any). * * This function will block if some conflicting card is already locking one of * the required resources (or any resource on a different bus segment, since P2P * bridges don't differentiate VGA memory and IO afaik). You can indicate * whether this blocking should be interruptible by a signal (for userland * interface) or not. * * Must not be called at interrupt time or in atomic context. If the card * already owns the resources, the function succeeds. Nested calls are * supported (a per-resource counter is maintained) * * On success, release the VGA resource again with vga_put(). * * Returns: * * 0 on success, negative error code on failure. */ int vga_get(struct pci_dev *pdev, unsigned int rsrc, int interruptible) { struct vga_device *vgadev, *conflict; unsigned long flags; wait_queue_entry_t wait; int rc = 0; vga_check_first_use(); /* The caller should check for this, but let's be sure */ if (pdev == NULL) pdev = vga_default_device(); if (pdev == NULL) return 0; for (;;) { spin_lock_irqsave(&vga_lock, flags); vgadev = vgadev_find(pdev); if (vgadev == NULL) { spin_unlock_irqrestore(&vga_lock, flags); rc = -ENODEV; break; } conflict = __vga_tryget(vgadev, rsrc); spin_unlock_irqrestore(&vga_lock, flags); if (conflict == NULL) break; /* * We have a conflict; we wait until somebody kicks the * work queue. Currently we have one work queue that we * kick each time some resources are released, but it would * be fairly easy to have a per-device one so that we only * need to attach to the conflicting device. */ init_waitqueue_entry(&wait, current); add_wait_queue(&vga_wait_queue, &wait); set_current_state(interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); if (interruptible && signal_pending(current)) { __set_current_state(TASK_RUNNING); remove_wait_queue(&vga_wait_queue, &wait); rc = -ERESTARTSYS; break; } schedule(); remove_wait_queue(&vga_wait_queue, &wait); } return rc; } EXPORT_SYMBOL(vga_get); /** * vga_tryget - try to acquire & lock legacy VGA resources * @pdev: PCI device of VGA card or NULL for system default * @rsrc: bit mask of resources to acquire and lock * * Perform the same operation as vga_get(), but return an error (-EBUSY) * instead of blocking if the resources are already locked by another card. * Can be called in any context. * * On success, release the VGA resource again with vga_put(). * * Returns: * * 0 on success, negative error code on failure. 
*/ static int vga_tryget(struct pci_dev *pdev, unsigned int rsrc) { struct vga_device *vgadev; unsigned long flags; int rc = 0; vga_check_first_use(); /* The caller should check for this, but let's be sure */ if (pdev == NULL) pdev = vga_default_device(); if (pdev == NULL) return 0; spin_lock_irqsave(&vga_lock, flags); vgadev = vgadev_find(pdev); if (vgadev == NULL) { rc = -ENODEV; goto bail; } if (__vga_tryget(vgadev, rsrc)) rc = -EBUSY; bail: spin_unlock_irqrestore(&vga_lock, flags); return rc; } /** * vga_put - release lock on legacy VGA resources * @pdev: PCI device of VGA card or NULL for system default * @rsrc: bit mask of resource to release * * Release resources previously locked by vga_get() or vga_tryget(). The * resources aren't disabled right away, so that a subsequent vga_get() on * the same card will succeed immediately. Resources have a counter, so * locks are only released if the counter reaches 0. */ void vga_put(struct pci_dev *pdev, unsigned int rsrc) { struct vga_device *vgadev; unsigned long flags; /* The caller should check for this, but let's be sure */ if (pdev == NULL) pdev = vga_default_device(); if (pdev == NULL) return; spin_lock_irqsave(&vga_lock, flags); vgadev = vgadev_find(pdev); if (vgadev == NULL) goto bail; __vga_put(vgadev, rsrc); bail: spin_unlock_irqrestore(&vga_lock, flags); } EXPORT_SYMBOL(vga_put); static bool vga_is_firmware_default(struct pci_dev *pdev) { #if defined CONFIG_X86 return pdev == screen_info_pci_dev(&screen_info); #else return false; #endif } static bool vga_arb_integrated_gpu(struct device *dev) { #if defined(CONFIG_ACPI) struct acpi_device *adev = ACPI_COMPANION(dev); return adev && !strcmp(acpi_device_hid(adev), ACPI_VIDEO_HID); #else return false; #endif } /* * Return true if vgadev is a better default VGA device than the best one * we've seen so far. */ static bool vga_is_boot_device(struct vga_device *vgadev) { struct vga_device *boot_vga = vgadev_find(vga_default_device()); struct pci_dev *pdev = vgadev->pdev; u16 cmd, boot_cmd; /* * We select the default VGA device in this order: * Firmware framebuffer (see vga_arb_select_default_device()) * Legacy VGA device (owns VGA_RSRC_LEGACY_MASK) * Non-legacy integrated device (see vga_arb_select_default_device()) * Non-legacy discrete device (see vga_arb_select_default_device()) * Other device (see vga_arb_select_default_device()) */ /* * We always prefer a firmware default device, so if we've already * found one, there's no need to consider vgadev. */ if (boot_vga && boot_vga->is_firmware_default) return false; if (vga_is_firmware_default(pdev)) { vgadev->is_firmware_default = true; return true; } /* * A legacy VGA device has MEM and IO enabled and any bridges * leading to it have PCI_BRIDGE_CTL_VGA enabled so the legacy * resources ([mem 0xa0000-0xbffff], [io 0x3b0-0x3bb], etc) are * routed to it. * * We use the first one we find, so if we've already found one, * vgadev is no better. */ if (boot_vga && (boot_vga->owns & VGA_RSRC_LEGACY_MASK) == VGA_RSRC_LEGACY_MASK) return false; if ((vgadev->owns & VGA_RSRC_LEGACY_MASK) == VGA_RSRC_LEGACY_MASK) return true; /* * If we haven't found a legacy VGA device, accept a non-legacy * device. It may have either IO or MEM enabled, and bridges may * not have PCI_BRIDGE_CTL_VGA enabled, so it may not be able to * use legacy VGA resources. Prefer an integrated GPU over others. */ pci_read_config_word(pdev, PCI_COMMAND, &cmd); if (cmd & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) { /* * An integrated GPU overrides a previous non-legacy * device. 
We expect only a single integrated GPU, but if * there are more, we use the *last* because that was the * previous behavior. */ if (vga_arb_integrated_gpu(&pdev->dev)) return true; /* * We prefer the first non-legacy discrete device we find. * If we already found one, vgadev is no better. */ if (boot_vga) { pci_read_config_word(boot_vga->pdev, PCI_COMMAND, &boot_cmd); if (boot_cmd & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) return false; } return true; } /* * Vgadev has neither IO nor MEM enabled. If we haven't found any * other VGA devices, it is the best candidate so far. */ if (!boot_vga) return true; return false; } /* * Rules for using a bridge to control a VGA descendant decoding: if a bridge * has only one VGA descendant then it can be used to control the VGA routing * for that device. It should always use the bridge closest to the device to * control it. If a bridge has a direct VGA descendant, but also have a sub- * bridge VGA descendant then we cannot use that bridge to control the direct * VGA descendant. So for every device we register, we need to iterate all * its parent bridges so we can invalidate any devices using them properly. */ static void vga_arbiter_check_bridge_sharing(struct vga_device *vgadev) { struct vga_device *same_bridge_vgadev; struct pci_bus *new_bus, *bus; struct pci_dev *new_bridge, *bridge; vgadev->bridge_has_one_vga = true; if (list_empty(&vga_list)) { vgaarb_info(&vgadev->pdev->dev, "bridge control possible\n"); return; } /* Iterate the new device's bridge hierarchy */ new_bus = vgadev->pdev->bus; while (new_bus) { new_bridge = new_bus->self; /* Go through list of devices already registered */ list_for_each_entry(same_bridge_vgadev, &vga_list, list) { bus = same_bridge_vgadev->pdev->bus; bridge = bus->self; /* See if it shares a bridge with this device */ if (new_bridge == bridge) { /* * If its direct parent bridge is the same * as any bridge of this device then it can't * be used for that device. */ same_bridge_vgadev->bridge_has_one_vga = false; } /* * Now iterate the previous device's bridge hierarchy. * If the new device's parent bridge is in the other * device's hierarchy, we can't use it to control this * device. */ while (bus) { bridge = bus->self; if (bridge && bridge == vgadev->pdev->bus->self) vgadev->bridge_has_one_vga = false; bus = bus->parent; } } new_bus = new_bus->parent; } if (vgadev->bridge_has_one_vga) vgaarb_info(&vgadev->pdev->dev, "bridge control possible\n"); else vgaarb_info(&vgadev->pdev->dev, "no bridge control possible\n"); } /* * Currently, we assume that the "initial" setup of the system is not sane, * that is, we come up with conflicting devices and let the arbiter's * client decide if devices decodes legacy things or not. */ static bool vga_arbiter_add_pci_device(struct pci_dev *pdev) { struct vga_device *vgadev; unsigned long flags; struct pci_bus *bus; struct pci_dev *bridge; u16 cmd; /* Allocate structure */ vgadev = kzalloc(sizeof(struct vga_device), GFP_KERNEL); if (vgadev == NULL) { vgaarb_err(&pdev->dev, "failed to allocate VGA arbiter data\n"); /* * What to do on allocation failure? For now, let's just do * nothing, I'm not sure there is anything saner to be done. 
*/ return false; } /* Take lock & check for duplicates */ spin_lock_irqsave(&vga_lock, flags); if (vgadev_find(pdev) != NULL) { BUG_ON(1); goto fail; } vgadev->pdev = pdev; /* By default, assume we decode everything */ vgadev->decodes = VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; /* By default, mark it as decoding */ vga_decode_count++; /* * Mark that we "own" resources based on our enables, we will * clear that below if the bridge isn't forwarding. */ pci_read_config_word(pdev, PCI_COMMAND, &cmd); if (cmd & PCI_COMMAND_IO) vgadev->owns |= VGA_RSRC_LEGACY_IO; if (cmd & PCI_COMMAND_MEMORY) vgadev->owns |= VGA_RSRC_LEGACY_MEM; /* Check if VGA cycles can get down to us */ bus = pdev->bus; while (bus) { bridge = bus->self; if (bridge) { u16 l; pci_read_config_word(bridge, PCI_BRIDGE_CONTROL, &l); if (!(l & PCI_BRIDGE_CTL_VGA)) { vgadev->owns = 0; break; } } bus = bus->parent; } if (vga_is_boot_device(vgadev)) { vgaarb_info(&pdev->dev, "setting as boot VGA device%s\n", vga_default_device() ? " (overriding previous)" : ""); vga_set_default_device(pdev); } vga_arbiter_check_bridge_sharing(vgadev); /* Add to the list */ list_add_tail(&vgadev->list, &vga_list); vga_count++; vgaarb_info(&pdev->dev, "VGA device added: decodes=%s,owns=%s,locks=%s\n", vga_iostate_to_str(vgadev->decodes), vga_iostate_to_str(vgadev->owns), vga_iostate_to_str(vgadev->locks)); spin_unlock_irqrestore(&vga_lock, flags); return true; fail: spin_unlock_irqrestore(&vga_lock, flags); kfree(vgadev); return false; } static bool vga_arbiter_del_pci_device(struct pci_dev *pdev) { struct vga_device *vgadev; unsigned long flags; bool ret = true; spin_lock_irqsave(&vga_lock, flags); vgadev = vgadev_find(pdev); if (vgadev == NULL) { ret = false; goto bail; } if (vga_default == pdev) vga_set_default_device(NULL); if (vgadev->decodes & (VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM)) vga_decode_count--; /* Remove entry from list */ list_del(&vgadev->list); vga_count--; /* Wake up all possible waiters */ wake_up_all(&vga_wait_queue); bail: spin_unlock_irqrestore(&vga_lock, flags); kfree(vgadev); return ret; } /* Called with the lock */ static void vga_update_device_decodes(struct vga_device *vgadev, unsigned int new_decodes) { struct device *dev = &vgadev->pdev->dev; unsigned int old_decodes = vgadev->decodes; unsigned int decodes_removed = ~new_decodes & old_decodes; unsigned int decodes_unlocked = vgadev->locks & decodes_removed; vgadev->decodes = new_decodes; vgaarb_info(dev, "VGA decodes changed: olddecodes=%s,decodes=%s:owns=%s\n", vga_iostate_to_str(old_decodes), vga_iostate_to_str(vgadev->decodes), vga_iostate_to_str(vgadev->owns)); /* If we removed locked decodes, lock count goes to zero, and release */ if (decodes_unlocked) { if (decodes_unlocked & VGA_RSRC_LEGACY_IO) vgadev->io_lock_cnt = 0; if (decodes_unlocked & VGA_RSRC_LEGACY_MEM) vgadev->mem_lock_cnt = 0; __vga_put(vgadev, decodes_unlocked); } /* Change decodes counter */ if (old_decodes & VGA_RSRC_LEGACY_MASK && !(new_decodes & VGA_RSRC_LEGACY_MASK)) vga_decode_count--; if (!(old_decodes & VGA_RSRC_LEGACY_MASK) && new_decodes & VGA_RSRC_LEGACY_MASK) vga_decode_count++; vgaarb_dbg(dev, "decoding count now is: %d\n", vga_decode_count); } static void __vga_set_legacy_decoding(struct pci_dev *pdev, unsigned int decodes, bool userspace) { struct vga_device *vgadev; unsigned long flags; decodes &= VGA_RSRC_LEGACY_MASK; spin_lock_irqsave(&vga_lock, flags); vgadev = vgadev_find(pdev); if (vgadev == NULL) goto bail; /* Don't let userspace futz 
with kernel driver decodes */ if (userspace && vgadev->set_decode) goto bail; /* Update the device decodes + counter */ vga_update_device_decodes(vgadev, decodes); /* * XXX If somebody is going from "doesn't decode" to "decodes" * state here, additional care must be taken as we may have pending * ownership of non-legacy region. */ bail: spin_unlock_irqrestore(&vga_lock, flags); } /** * vga_set_legacy_decoding * @pdev: PCI device of the VGA card * @decodes: bit mask of what legacy regions the card decodes * * Indicate to the arbiter if the card decodes legacy VGA IOs, legacy VGA * Memory, both, or none. All cards default to both, the card driver (fbdev for * example) should tell the arbiter if it has disabled legacy decoding, so the * card can be left out of the arbitration process (and can be safe to take * interrupts at any time. */ void vga_set_legacy_decoding(struct pci_dev *pdev, unsigned int decodes) { __vga_set_legacy_decoding(pdev, decodes, false); } EXPORT_SYMBOL(vga_set_legacy_decoding); /** * vga_client_register - register or unregister a VGA arbitration client * @pdev: PCI device of the VGA client * @set_decode: VGA decode change callback * * Clients have two callback mechanisms they can use. * * @set_decode callback: If a client can disable its GPU VGA resource, it * will get a callback from this to set the encode/decode state. * * Rationale: we cannot disable VGA decode resources unconditionally * because some single GPU laptops seem to require ACPI or BIOS access to * the VGA registers to control things like backlights etc. Hopefully newer * multi-GPU laptops do something saner, and desktops won't have any * special ACPI for this. The driver will get a callback when VGA * arbitration is first used by userspace since some older X servers have * issues. * * Does not check whether a client for @pdev has been registered already. * * To unregister, call vga_client_unregister(). * * Returns: 0 on success, -ENODEV on failure */ int vga_client_register(struct pci_dev *pdev, unsigned int (*set_decode)(struct pci_dev *pdev, bool decode)) { unsigned long flags; struct vga_device *vgadev; spin_lock_irqsave(&vga_lock, flags); vgadev = vgadev_find(pdev); if (vgadev) vgadev->set_decode = set_decode; spin_unlock_irqrestore(&vga_lock, flags); if (!vgadev) return -ENODEV; return 0; } EXPORT_SYMBOL(vga_client_register); /* * Char driver implementation * * Semantics is: * * open : Open user instance of the arbiter. By default, it's * attached to the default VGA device of the system. * * close : Close user instance, release locks * * read : Return a string indicating the status of the target. * An IO state string is of the form {io,mem,io+mem,none}, * mc and ic are respectively mem and io lock counts (for * debugging/diagnostic only). "decodes" indicate what the * card currently decodes, "owns" indicates what is currently * enabled on it, and "locks" indicates what is locked by this * card. If the card is unplugged, we get "invalid" then for * card_ID and an -ENODEV error is returned for any command * until a new card is targeted * * "<card_ID>,decodes=<io_state>,owns=<io_state>,locks=<io_state> (ic,mc)" * * write : write a command to the arbiter. 
List of commands is: * * target <card_ID> : switch target to card <card_ID> (see below) * lock <io_state> : acquire locks on target ("none" is invalid io_state) * trylock <io_state> : non-blocking acquire locks on target * unlock <io_state> : release locks on target * unlock all : release all locks on target held by this user * decodes <io_state> : set the legacy decoding attributes for the card * * poll : event if something change on any card (not just the target) * * card_ID is of the form "PCI:domain:bus:dev.fn". It can be set to "default" * to go back to the system default card (TODO: not implemented yet). * Currently, only PCI is supported as a prefix, but the userland API may * support other bus types in the future, even if the current kernel * implementation doesn't. * * Note about locks: * * The driver keeps track of which user has what locks on which card. It * supports stacking, like the kernel one. This complicates the implementation * a bit, but makes the arbiter more tolerant to userspace problems and able * to properly cleanup in all cases when a process dies. * Currently, a max of 16 cards simultaneously can have locks issued from * userspace for a given user (file descriptor instance) of the arbiter. * * If the device is hot-unplugged, there is a hook inside the module to notify * it being added/removed in the system and automatically added/removed in * the arbiter. */ #define MAX_USER_CARDS CONFIG_VGA_ARB_MAX_GPUS #define PCI_INVALID_CARD ((struct pci_dev *)-1UL) /* Each user has an array of these, tracking which cards have locks */ struct vga_arb_user_card { struct pci_dev *pdev; unsigned int mem_cnt; unsigned int io_cnt; }; struct vga_arb_private { struct list_head list; struct pci_dev *target; struct vga_arb_user_card cards[MAX_USER_CARDS]; spinlock_t lock; }; static LIST_HEAD(vga_user_list); static DEFINE_SPINLOCK(vga_user_lock); /* * Take a string in the format: "PCI:domain:bus:dev.fn" and return the * respective values. If the string is not in this format, return 0. */ static int vga_pci_str_to_vars(char *buf, int count, unsigned int *domain, unsigned int *bus, unsigned int *devfn) { int n; unsigned int slot, func; n = sscanf(buf, "PCI:%x:%x:%x.%x", domain, bus, &slot, &func); if (n != 4) return 0; *devfn = PCI_DEVFN(slot, func); return 1; } static ssize_t vga_arb_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct vga_arb_private *priv = file->private_data; struct vga_device *vgadev; struct pci_dev *pdev; unsigned long flags; size_t len; int rc; char *lbuf; lbuf = kmalloc(1024, GFP_KERNEL); if (lbuf == NULL) return -ENOMEM; /* Protect vga_list */ spin_lock_irqsave(&vga_lock, flags); /* If we are targeting the default, use it */ pdev = priv->target; if (pdev == NULL || pdev == PCI_INVALID_CARD) { spin_unlock_irqrestore(&vga_lock, flags); len = sprintf(lbuf, "invalid"); goto done; } /* Find card vgadev structure */ vgadev = vgadev_find(pdev); if (vgadev == NULL) { /* * Wow, it's not in the list, that shouldn't happen, let's * fix us up and return invalid card. 
*/ spin_unlock_irqrestore(&vga_lock, flags); len = sprintf(lbuf, "invalid"); goto done; } /* Fill the buffer with info */ len = snprintf(lbuf, 1024, "count:%d,PCI:%s,decodes=%s,owns=%s,locks=%s(%u:%u)\n", vga_decode_count, pci_name(pdev), vga_iostate_to_str(vgadev->decodes), vga_iostate_to_str(vgadev->owns), vga_iostate_to_str(vgadev->locks), vgadev->io_lock_cnt, vgadev->mem_lock_cnt); spin_unlock_irqrestore(&vga_lock, flags); done: /* Copy that to user */ if (len > count) len = count; rc = copy_to_user(buf, lbuf, len); kfree(lbuf); if (rc) return -EFAULT; return len; } /* * TODO: To avoid parsing inside kernel and to improve the speed we may * consider use ioctl here */ static ssize_t vga_arb_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct vga_arb_private *priv = file->private_data; struct vga_arb_user_card *uc = NULL; struct pci_dev *pdev; unsigned int io_state; char kbuf[64], *curr_pos; size_t remaining = count; int ret_val; int i; if (count >= sizeof(kbuf)) return -EINVAL; if (copy_from_user(kbuf, buf, count)) return -EFAULT; curr_pos = kbuf; kbuf[count] = '\0'; if (strncmp(curr_pos, "lock ", 5) == 0) { curr_pos += 5; remaining -= 5; pr_debug("client 0x%p called 'lock'\n", priv); if (!vga_str_to_iostate(curr_pos, remaining, &io_state)) { ret_val = -EPROTO; goto done; } if (io_state == VGA_RSRC_NONE) { ret_val = -EPROTO; goto done; } pdev = priv->target; if (priv->target == NULL) { ret_val = -ENODEV; goto done; } vga_get_uninterruptible(pdev, io_state); /* Update the client's locks lists */ for (i = 0; i < MAX_USER_CARDS; i++) { if (priv->cards[i].pdev == pdev) { if (io_state & VGA_RSRC_LEGACY_IO) priv->cards[i].io_cnt++; if (io_state & VGA_RSRC_LEGACY_MEM) priv->cards[i].mem_cnt++; break; } } ret_val = count; goto done; } else if (strncmp(curr_pos, "unlock ", 7) == 0) { curr_pos += 7; remaining -= 7; pr_debug("client 0x%p called 'unlock'\n", priv); if (strncmp(curr_pos, "all", 3) == 0) io_state = VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; else { if (!vga_str_to_iostate (curr_pos, remaining, &io_state)) { ret_val = -EPROTO; goto done; } /* TODO: Add this? if (io_state == VGA_RSRC_NONE) { ret_val = -EPROTO; goto done; } */ } pdev = priv->target; if (priv->target == NULL) { ret_val = -ENODEV; goto done; } for (i = 0; i < MAX_USER_CARDS; i++) { if (priv->cards[i].pdev == pdev) uc = &priv->cards[i]; } if (!uc) { ret_val = -EINVAL; goto done; } if (io_state & VGA_RSRC_LEGACY_IO && uc->io_cnt == 0) { ret_val = -EINVAL; goto done; } if (io_state & VGA_RSRC_LEGACY_MEM && uc->mem_cnt == 0) { ret_val = -EINVAL; goto done; } vga_put(pdev, io_state); if (io_state & VGA_RSRC_LEGACY_IO) uc->io_cnt--; if (io_state & VGA_RSRC_LEGACY_MEM) uc->mem_cnt--; ret_val = count; goto done; } else if (strncmp(curr_pos, "trylock ", 8) == 0) { curr_pos += 8; remaining -= 8; pr_debug("client 0x%p called 'trylock'\n", priv); if (!vga_str_to_iostate(curr_pos, remaining, &io_state)) { ret_val = -EPROTO; goto done; } /* TODO: Add this? if (io_state == VGA_RSRC_NONE) { ret_val = -EPROTO; goto done; } */ pdev = priv->target; if (priv->target == NULL) { ret_val = -ENODEV; goto done; } if (vga_tryget(pdev, io_state)) { /* Update the client's locks lists... 
*/ for (i = 0; i < MAX_USER_CARDS; i++) { if (priv->cards[i].pdev == pdev) { if (io_state & VGA_RSRC_LEGACY_IO) priv->cards[i].io_cnt++; if (io_state & VGA_RSRC_LEGACY_MEM) priv->cards[i].mem_cnt++; break; } } ret_val = count; goto done; } else { ret_val = -EBUSY; goto done; } } else if (strncmp(curr_pos, "target ", 7) == 0) { unsigned int domain, bus, devfn; struct vga_device *vgadev; curr_pos += 7; remaining -= 7; pr_debug("client 0x%p called 'target'\n", priv); /* If target is default */ if (!strncmp(curr_pos, "default", 7)) pdev = pci_dev_get(vga_default_device()); else { if (!vga_pci_str_to_vars(curr_pos, remaining, &domain, &bus, &devfn)) { ret_val = -EPROTO; goto done; } pdev = pci_get_domain_bus_and_slot(domain, bus, devfn); if (!pdev) { pr_debug("invalid PCI address %04x:%02x:%02x.%x\n", domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); ret_val = -ENODEV; goto done; } pr_debug("%s ==> %04x:%02x:%02x.%x pdev %p\n", curr_pos, domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pdev); } vgadev = vgadev_find(pdev); pr_debug("vgadev %p\n", vgadev); if (vgadev == NULL) { if (pdev) { vgaarb_dbg(&pdev->dev, "not a VGA device\n"); pci_dev_put(pdev); } ret_val = -ENODEV; goto done; } priv->target = pdev; for (i = 0; i < MAX_USER_CARDS; i++) { if (priv->cards[i].pdev == pdev) break; if (priv->cards[i].pdev == NULL) { priv->cards[i].pdev = pdev; priv->cards[i].io_cnt = 0; priv->cards[i].mem_cnt = 0; break; } } if (i == MAX_USER_CARDS) { vgaarb_dbg(&pdev->dev, "maximum user cards (%d) number reached, ignoring this one!\n", MAX_USER_CARDS); pci_dev_put(pdev); /* XXX: Which value to return? */ ret_val = -ENOMEM; goto done; } ret_val = count; pci_dev_put(pdev); goto done; } else if (strncmp(curr_pos, "decodes ", 8) == 0) { curr_pos += 8; remaining -= 8; pr_debug("client 0x%p called 'decodes'\n", priv); if (!vga_str_to_iostate(curr_pos, remaining, &io_state)) { ret_val = -EPROTO; goto done; } pdev = priv->target; if (priv->target == NULL) { ret_val = -ENODEV; goto done; } __vga_set_legacy_decoding(pdev, io_state, true); ret_val = count; goto done; } /* If we got here, the message written is not part of the protocol! */ return -EPROTO; done: return ret_val; } static __poll_t vga_arb_fpoll(struct file *file, poll_table *wait) { pr_debug("%s\n", __func__); poll_wait(file, &vga_wait_queue, wait); return EPOLLIN; } static int vga_arb_open(struct inode *inode, struct file *file) { struct vga_arb_private *priv; unsigned long flags; pr_debug("%s\n", __func__); priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv == NULL) return -ENOMEM; spin_lock_init(&priv->lock); file->private_data = priv; spin_lock_irqsave(&vga_user_lock, flags); list_add(&priv->list, &vga_user_list); spin_unlock_irqrestore(&vga_user_lock, flags); /* Set the client's lists of locks */ priv->target = vga_default_device(); /* Maybe this is still null! 
*/ priv->cards[0].pdev = priv->target; priv->cards[0].io_cnt = 0; priv->cards[0].mem_cnt = 0; return 0; } static int vga_arb_release(struct inode *inode, struct file *file) { struct vga_arb_private *priv = file->private_data; struct vga_arb_user_card *uc; unsigned long flags; int i; pr_debug("%s\n", __func__); spin_lock_irqsave(&vga_user_lock, flags); list_del(&priv->list); for (i = 0; i < MAX_USER_CARDS; i++) { uc = &priv->cards[i]; if (uc->pdev == NULL) continue; vgaarb_dbg(&uc->pdev->dev, "uc->io_cnt == %d, uc->mem_cnt == %d\n", uc->io_cnt, uc->mem_cnt); while (uc->io_cnt--) vga_put(uc->pdev, VGA_RSRC_LEGACY_IO); while (uc->mem_cnt--) vga_put(uc->pdev, VGA_RSRC_LEGACY_MEM); } spin_unlock_irqrestore(&vga_user_lock, flags); kfree(priv); return 0; } /* * Callback any registered clients to let them know we have a change in VGA * cards. */ static void vga_arbiter_notify_clients(void) { struct vga_device *vgadev; unsigned long flags; unsigned int new_decodes; bool new_state; if (!vga_arbiter_used) return; new_state = (vga_count > 1) ? false : true; spin_lock_irqsave(&vga_lock, flags); list_for_each_entry(vgadev, &vga_list, list) { if (vgadev->set_decode) { new_decodes = vgadev->set_decode(vgadev->pdev, new_state); vga_update_device_decodes(vgadev, new_decodes); } } spin_unlock_irqrestore(&vga_lock, flags); } static int pci_notify(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; struct pci_dev *pdev = to_pci_dev(dev); bool notify = false; vgaarb_dbg(dev, "%s\n", __func__); /* Only deal with VGA class devices */ if (!pci_is_vga(pdev)) return 0; /* * For now, we're only interested in devices added and removed. * I didn't test this thing here, so someone needs to double check * for the cases of hot-pluggable VGA cards. */ if (action == BUS_NOTIFY_ADD_DEVICE) notify = vga_arbiter_add_pci_device(pdev); else if (action == BUS_NOTIFY_DEL_DEVICE) notify = vga_arbiter_del_pci_device(pdev); if (notify) vga_arbiter_notify_clients(); return 0; } static struct notifier_block pci_notifier = { .notifier_call = pci_notify, }; static const struct file_operations vga_arb_device_fops = { .read = vga_arb_read, .write = vga_arb_write, .poll = vga_arb_fpoll, .open = vga_arb_open, .release = vga_arb_release, .llseek = noop_llseek, }; static struct miscdevice vga_arb_device = { MISC_DYNAMIC_MINOR, "vga_arbiter", &vga_arb_device_fops }; static int __init vga_arb_device_init(void) { int rc; struct pci_dev *pdev; rc = misc_register(&vga_arb_device); if (rc < 0) pr_err("error %d registering device\n", rc); bus_register_notifier(&pci_bus_type, &pci_notifier); /* Add all VGA class PCI devices by default */ pdev = NULL; while ((pdev = pci_get_subsys(PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL) { if (pci_is_vga(pdev)) vga_arbiter_add_pci_device(pdev); } pr_info("loaded\n"); return rc; } subsys_initcall_sync(vga_arb_device_init);
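For the kernel-internal interface above, a GPU driver typically registers a decode callback and brackets legacy VGA register access with vga_get()/vga_put(). The sketch below uses only the entry points defined in this file (vga_client_register(), vga_get(), vga_put()) and the VGA_RSRC_* masks from <linux/vgaarb.h>; the callback and probe helper themselves are hypothetical.

static unsigned int example_set_decode(struct pci_dev *pdev, bool decode)
{
	/*
	 * Report which resources this device keeps decoding once the
	 * arbiter asks it to shrink; here legacy decoding is dropped.
	 */
	return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

static int example_probe(struct pci_dev *pdev)
{
	int ret;

	/* Get a callback whenever VGA routing has to change. */
	ret = vga_client_register(pdev, example_set_decode);
	if (ret)
		return ret;

	/* Hold the legacy ranges exclusively while programming them. */
	ret = vga_get(pdev, VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM, 1);
	if (ret)
		return ret;

	/* ... touch legacy VGA registers here ... */

	vga_put(pdev, VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM);
	return 0;
}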
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C), 2008-2021, OPPO Mobile Comm Corp., Ltd.
 * https://www.oppo.com/
 */
#include <linux/sysfs.h>
#include <linux/kobject.h>

#include "internal.h"
#include "compress.h"

enum {
	attr_feature,
	attr_drop_caches,
	attr_pointer_ui,
	attr_pointer_bool,
	attr_accel,
};

enum {
	struct_erofs_sb_info,
	struct_erofs_mount_opts,
};

struct erofs_attr {
	struct attribute attr;
	short attr_id;
	int struct_type, offset;
};

#define EROFS_ATTR(_name, _mode, _id)					\
static struct erofs_attr erofs_attr_##_name = {				\
	.attr = {.name = __stringify(_name), .mode = _mode },		\
	.attr_id = attr_##_id,						\
}
#define EROFS_ATTR_FUNC(_name, _mode)	EROFS_ATTR(_name, _mode, _name)
#define EROFS_ATTR_FEATURE(_name)	EROFS_ATTR(_name, 0444, feature)

#define EROFS_ATTR_OFFSET(_name, _mode, _id, _struct)			\
static struct erofs_attr erofs_attr_##_name = {				\
	.attr = {.name = __stringify(_name), .mode = _mode },		\
	.attr_id = attr_##_id,						\
	.struct_type = struct_##_struct,				\
	.offset = offsetof(struct _struct, _name),			\
}

#define EROFS_ATTR_RW(_name, _id, _struct)	\
	EROFS_ATTR_OFFSET(_name, 0644, _id, _struct)

#define EROFS_RO_ATTR(_name, _id, _struct)	\
	EROFS_ATTR_OFFSET(_name, 0444, _id, _struct)

#define EROFS_ATTR_RW_UI(_name, _struct)	\
	EROFS_ATTR_RW(_name, pointer_ui, _struct)

#define EROFS_ATTR_RW_BOOL(_name, _struct)	\
	EROFS_ATTR_RW(_name, pointer_bool, _struct)

#define ATTR_LIST(name) (&erofs_attr_##name.attr)

#ifdef CONFIG_EROFS_FS_ZIP
EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts);
EROFS_ATTR_FUNC(drop_caches, 0200);
#endif
#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
EROFS_ATTR_FUNC(accel, 0644);
#endif
EROFS_ATTR_RW_UI(dir_ra_bytes, erofs_sb_info);

static struct attribute *erofs_sb_attrs[] = {
#ifdef CONFIG_EROFS_FS_ZIP
	ATTR_LIST(sync_decompress),
	ATTR_LIST(drop_caches),
#endif
	ATTR_LIST(dir_ra_bytes),
	NULL,
};
ATTRIBUTE_GROUPS(erofs_sb);

static struct attribute *erofs_attrs[] = {
#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
	ATTR_LIST(accel),
#endif
	NULL,
};
ATTRIBUTE_GROUPS(erofs);

/* Features this copy of erofs supports */
EROFS_ATTR_FEATURE(zero_padding);
EROFS_ATTR_FEATURE(compr_cfgs);
EROFS_ATTR_FEATURE(big_pcluster);
EROFS_ATTR_FEATURE(chunked_file);
EROFS_ATTR_FEATURE(device_table);
EROFS_ATTR_FEATURE(compr_head2);
EROFS_ATTR_FEATURE(sb_chksum);
EROFS_ATTR_FEATURE(ztailpacking);
EROFS_ATTR_FEATURE(fragments);
EROFS_ATTR_FEATURE(dedupe);
EROFS_ATTR_FEATURE(48bit);
EROFS_ATTR_FEATURE(metabox); static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(zero_padding), ATTR_LIST(compr_cfgs), ATTR_LIST(big_pcluster), ATTR_LIST(chunked_file), ATTR_LIST(device_table), ATTR_LIST(compr_head2), ATTR_LIST(sb_chksum), ATTR_LIST(ztailpacking), ATTR_LIST(fragments), ATTR_LIST(dedupe), ATTR_LIST(48bit), ATTR_LIST(metabox), NULL, }; ATTRIBUTE_GROUPS(erofs_feat); static unsigned char *__struct_ptr(struct erofs_sb_info *sbi, int struct_type, int offset) { if (struct_type == struct_erofs_sb_info) return (unsigned char *)sbi + offset; if (struct_type == struct_erofs_mount_opts) return (unsigned char *)&sbi->opt + offset; return NULL; } static ssize_t erofs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, s_kobj); struct erofs_attr *a = container_of(attr, struct erofs_attr, attr); unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset); switch (a->attr_id) { case attr_feature: return sysfs_emit(buf, "supported\n"); case attr_pointer_ui: if (!ptr) return 0; return sysfs_emit(buf, "%u\n", *(unsigned int *)ptr); case attr_pointer_bool: if (!ptr) return 0; return sysfs_emit(buf, "%d\n", *(bool *)ptr); case attr_accel: return z_erofs_crypto_show_engines(buf, PAGE_SIZE, '\n'); } return 0; } static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, s_kobj); struct erofs_attr *a = container_of(attr, struct erofs_attr, attr); unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset); unsigned long t; int ret; switch (a->attr_id) { case attr_pointer_ui: if (!ptr) return 0; ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret) return ret; if (t != (unsigned int)t) return -ERANGE; #ifdef CONFIG_EROFS_FS_ZIP if (!strcmp(a->attr.name, "sync_decompress") && (t > EROFS_SYNC_DECOMPRESS_FORCE_OFF)) return -EINVAL; #endif *(unsigned int *)ptr = t; return len; case attr_pointer_bool: if (!ptr) return 0; ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret) return ret; if (t != 0 && t != 1) return -EINVAL; *(bool *)ptr = !!t; return len; #ifdef CONFIG_EROFS_FS_ZIP case attr_drop_caches: ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret) return ret; if (t < 1 || t > 3) return -EINVAL; if (t & 2) z_erofs_shrink_scan(sbi, ~0UL); if (t & 1) invalidate_mapping_pages(MNGD_MAPPING(sbi), 0, -1); return len; #endif #ifdef CONFIG_EROFS_FS_ZIP_ACCEL case attr_accel: buf = skip_spaces(buf); z_erofs_crypto_disable_all_engines(); while (*buf) { t = strcspn(buf, "\n"); ret = z_erofs_crypto_enable_engine(buf, t); if (ret < 0) return ret; buf += buf[t] != '\0' ? 
t + 1 : t; } return len; #endif } return 0; } static void erofs_sb_release(struct kobject *kobj) { struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, s_kobj); complete(&sbi->s_kobj_unregister); } static const struct sysfs_ops erofs_attr_ops = { .show = erofs_attr_show, .store = erofs_attr_store, }; static const struct kobj_type erofs_sb_ktype = { .default_groups = erofs_sb_groups, .sysfs_ops = &erofs_attr_ops, .release = erofs_sb_release, }; static const struct kobj_type erofs_ktype = { .default_groups = erofs_groups, .sysfs_ops = &erofs_attr_ops, }; static struct kset erofs_root = { .kobj = {.ktype = &erofs_ktype}, }; static const struct kobj_type erofs_feat_ktype = { .default_groups = erofs_feat_groups, .sysfs_ops = &erofs_attr_ops, }; static struct kobject erofs_feat = { .kset = &erofs_root, }; int erofs_register_sysfs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); int err; sbi->s_kobj.kset = &erofs_root; init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", sb->s_sysfs_name); if (err) { kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); } return err; } void erofs_unregister_sysfs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); if (sbi->s_kobj.state_in_sysfs) { kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); } } void erofs_exit_sysfs(void) { kobject_put(&erofs_feat); kset_unregister(&erofs_root); } int __init erofs_init_sysfs(void) { int ret; kobject_set_name(&erofs_root.kobj, "erofs"); erofs_root.kobj.parent = fs_kobj; ret = kset_register(&erofs_root); if (!ret) { ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype, NULL, "features"); if (!ret) return 0; erofs_exit_sysfs(); } return ret; }
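The EROFS_ATTR_* macros above reduce each sysfs tunable to a single declaration: the attribute name doubles as the backing struct member, so __struct_ptr() can resolve the value from struct_type and offset and the generic show/store handlers need no per-attribute code. A minimal sketch of exposing one more unsigned-int knob follows; readahead_pages is a hypothetical member of struct erofs_sb_info, used purely for illustration.

/*
 * Illustrative only: assumes a hypothetical "readahead_pages" field in
 * struct erofs_sb_info. EROFS_ATTR_RW_UI() records the member's offset so
 * erofs_attr_show()/erofs_attr_store() can read and write it generically.
 */
EROFS_ATTR_RW_UI(readahead_pages, erofs_sb_info);

static struct attribute *erofs_example_attrs[] = {
	ATTR_LIST(readahead_pages),	/* would surface as a 0644 file under /sys/fs/erofs/<device>/ */
	NULL,
};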
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Definitions for the mount interface. This describes the in-kernel
 * linked list of mounted filesystems.
 *
 * Author:  Marco van Wieringen <mvw@planets.elm.net>
 */
#ifndef _LINUX_MOUNT_H
#define _LINUX_MOUNT_H

#include <linux/types.h>
#include <asm/barrier.h>

struct super_block;
struct dentry;
struct user_namespace;
struct mnt_idmap;
struct file_system_type;
struct fs_context;
struct file;
struct path;

enum mount_flags {
	MNT_NOSUID	= 0x01,
	MNT_NODEV	= 0x02,
	MNT_NOEXEC	= 0x04,
	MNT_NOATIME	= 0x08,
	MNT_NODIRATIME	= 0x10,
	MNT_RELATIME	= 0x20,
	MNT_READONLY	= 0x40, /* does the user want this to be r/o? */
	MNT_NOSYMFOLLOW	= 0x80,

	MNT_SHRINKABLE	= 0x100,

	MNT_INTERNAL	= 0x4000,

	MNT_LOCK_ATIME		= 0x040000,
	MNT_LOCK_NOEXEC		= 0x080000,
	MNT_LOCK_NOSUID		= 0x100000,
	MNT_LOCK_NODEV		= 0x200000,
	MNT_LOCK_READONLY	= 0x400000,
	MNT_LOCKED		= 0x800000,
	MNT_DOOMED		= 0x1000000,
	MNT_SYNC_UMOUNT		= 0x2000000,
	MNT_UMOUNT		= 0x8000000,

	MNT_USER_SETTABLE_MASK	= MNT_NOSUID | MNT_NODEV | MNT_NOEXEC
				  | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME
				  | MNT_READONLY | MNT_NOSYMFOLLOW,

	MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME,

	MNT_INTERNAL_FLAGS = MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT |
			     MNT_LOCKED
};

struct vfsmount {
	struct dentry *mnt_root;	/* root of the mounted tree */
	struct super_block *mnt_sb;	/* pointer to superblock */
	int mnt_flags;
	struct mnt_idmap *mnt_idmap;
} __randomize_layout;

static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt)
{
	/* Pairs with smp_store_release() in do_idmap_mount().
*/ return READ_ONCE(mnt->mnt_idmap); } extern int mnt_want_write(struct vfsmount *mnt); extern int mnt_want_write_file(struct file *file); extern void mnt_drop_write(struct vfsmount *mnt); extern void mnt_drop_write_file(struct file *file); extern void mntput(struct vfsmount *mnt); extern struct vfsmount *mntget(struct vfsmount *mnt); extern void mnt_make_shortterm(struct vfsmount *mnt); extern struct vfsmount *mnt_clone_internal(const struct path *path); extern bool __mnt_is_readonly(const struct vfsmount *mnt); extern bool mnt_may_suid(struct vfsmount *mnt); extern struct vfsmount *clone_private_mount(const struct path *path); int mnt_get_write_access(struct vfsmount *mnt); void mnt_put_write_access(struct vfsmount *mnt); extern struct vfsmount *fc_mount(struct fs_context *fc); extern struct vfsmount *fc_mount_longterm(struct fs_context *fc); extern struct vfsmount *vfs_create_mount(struct fs_context *fc); extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data); extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); extern bool path_is_mountpoint(const struct path *path); extern bool our_mnt(struct vfsmount *mnt); extern struct vfsmount *kern_mount(struct file_system_type *); extern void kern_unmount(struct vfsmount *mnt); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); int do_mount(const char *, const char __user *, const char *, unsigned long, void *); extern const struct path *collect_paths(const struct path *, struct path *, unsigned); extern void drop_collected_paths(const struct path *, const struct path *); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); extern int cifs_root_data(char **dev, char **opts); #endif /* _LINUX_MOUNT_H */
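The mnt_want_write()/mnt_drop_write() pair declared above brackets any modification made through a vfsmount: the "want" side takes a write reference and fails once the mount is read-only, and the "drop" side releases it. A minimal usage sketch of that pattern follows; do_modify() is a hypothetical placeholder for the actual filesystem change.

/* Sketch only: do_modify() is a hypothetical placeholder for the real change. */
static int modify_through_mount(struct vfsmount *mnt, struct dentry *dentry)
{
	int err;

	err = mnt_want_write(mnt);	/* refused (e.g. -EROFS) on read-only mounts */
	if (err)
		return err;

	err = do_modify(dentry);	/* perform the actual modification */

	mnt_drop_write(mnt);		/* always paired with a successful mnt_want_write() */
	return err;
}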
/*
 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
* */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/in.h> #include <linux/ipv6.h> #include "rds.h" #include "loop.h" static char * const rds_trans_modules[] = { [RDS_TRANS_IB] = "rds_rdma", [RDS_TRANS_GAP] = NULL, [RDS_TRANS_TCP] = "rds_tcp", }; static struct rds_transport *transports[RDS_TRANS_COUNT]; static DECLARE_RWSEM(rds_trans_sem); void rds_trans_register(struct rds_transport *trans) { BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ); down_write(&rds_trans_sem); if (transports[trans->t_type]) printk(KERN_ERR "RDS Transport type %d already registered\n", trans->t_type); else { transports[trans->t_type] = trans; printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name); } up_write(&rds_trans_sem); } EXPORT_SYMBOL_GPL(rds_trans_register); void rds_trans_unregister(struct rds_transport *trans) { down_write(&rds_trans_sem); transports[trans->t_type] = NULL; printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name); up_write(&rds_trans_sem); } EXPORT_SYMBOL_GPL(rds_trans_unregister); void rds_trans_put(struct rds_transport *trans) { if (trans) module_put(trans->t_owner); } struct rds_transport *rds_trans_get_preferred(struct net *net, const struct in6_addr *addr, __u32 scope_id) { struct rds_transport *ret = NULL; struct rds_transport *trans; unsigned int i; if (ipv6_addr_v4mapped(addr)) { if (*(u_int8_t *)&addr->s6_addr32[3] == IN_LOOPBACKNET) return &rds_loop_transport; } else if (ipv6_addr_loopback(addr)) { return &rds_loop_transport; } down_read(&rds_trans_sem); for (i = 0; i < RDS_TRANS_COUNT; i++) { trans = transports[i]; if (trans && (trans->laddr_check(net, addr, scope_id) == 0) && (!trans->t_owner || try_module_get(trans->t_owner))) { ret = trans; break; } } up_read(&rds_trans_sem); return ret; } struct rds_transport *rds_trans_get(int t_type) { struct rds_transport *ret = NULL; struct rds_transport *trans; down_read(&rds_trans_sem); trans = transports[t_type]; if (!trans) { up_read(&rds_trans_sem); if (rds_trans_modules[t_type]) request_module(rds_trans_modules[t_type]); down_read(&rds_trans_sem); trans = transports[t_type]; } if (trans && trans->t_type == t_type && (!trans->t_owner || try_module_get(trans->t_owner))) ret = trans; up_read(&rds_trans_sem); return ret; } /* * This returns the number of stats entries in the snapshot and only * copies them using the iter if there is enough space for them. The * caller passes in the global stats so that we can size and copy while * holding the lock. */ unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail) { struct rds_transport *trans; unsigned int total = 0; unsigned int part; int i; rds_info_iter_unmap(iter); down_read(&rds_trans_sem); for (i = 0; i < RDS_TRANS_COUNT; i++) { trans = transports[i]; if (!trans || !trans->stats_info_copy) continue; part = trans->stats_info_copy(iter, avail); avail -= min(avail, part); total += part; } up_read(&rds_trans_sem); return total; }
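A transport participates in the lookups above by filling in a struct rds_transport and calling rds_trans_register() from its module init path, undoing that with rds_trans_unregister() on exit. The abridged sketch below shows only the fields this file actually consults (t_name, t_type, t_owner, laddr_check); the example_* names are hypothetical, and a real transport supplies many more callbacks.

/* Abridged illustration; the real structure carries many more operations. */
static int example_laddr_check(struct net *net, const struct in6_addr *addr,
			       __u32 scope_id)
{
	return 0;	/* claim any local address, purely for the example */
}

static struct rds_transport example_transport = {
	.t_name		= "example",
	.t_owner	= THIS_MODULE,		/* pinned with try_module_get() on lookup */
	.t_type		= RDS_TRANS_TCP,	/* a real transport uses its own RDS_TRANS_* slot */
	.laddr_check	= example_laddr_check,	/* returning 0 means the address is usable */
};

static int __init example_init(void)
{
	rds_trans_register(&example_transport);
	return 0;
}
module_init(example_init);

static void __exit example_exit(void)
{
	rds_trans_unregister(&example_transport);
}
module_exit(example_exit);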
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This header defines architecture specific interfaces, x86 version
 */

#ifndef _ASM_X86_KVM_HOST_H
#define _ASM_X86_KVM_HOST_H

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/tracepoint.h>
#include <linux/cpumask.h>
#include <linux/irq_work.h>
#include <linux/irq.h>
#include <linux/workqueue.h>

#include <linux/kvm.h>
#include <linux/kvm_para.h>
#include <linux/kvm_types.h>
#include <linux/perf_event.h>
#include <linux/pvclock_gtod.h>
#include <linux/clocksource.h>
#include <linux/irqbypass.h>
#include <linux/kfifo.h>
#include <linux/sched/vhost_task.h>
#include <linux/call_once.h>
#include <linux/atomic.h>

#include <asm/apic.h>
#include <asm/pvclock-abi.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/msr-index.h>
#include <asm/msr.h>
#include <asm/asm.h>
#include <asm/irq_remapping.h>
#include <asm/kvm_page_track.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/reboot.h>
#include <hyperv/hvhdk.h>

#define __KVM_HAVE_ARCH_VCPU_DEBUGFS

/*
 * CONFIG_KVM_MAX_NR_VCPUS is defined iff CONFIG_KVM!=n, provide a dummy max if
 * KVM is disabled (arbitrarily use the default from CONFIG_KVM_MAX_NR_VCPUS).
 */
#ifdef CONFIG_KVM_MAX_NR_VCPUS
#define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS
#else
#define KVM_MAX_VCPUS 1024
#endif

/*
 * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs
 * might be larger than the actual number of VCPUs because the
 * APIC ID encodes CPU topology information.
 *
 * In the worst case, we'll need less than one extra bit for the
 * Core ID, and less than one extra bit for the Package (Die) ID,
 * so ratio of 4 should be enough.
*/ #define KVM_VCPU_ID_RATIO 4 #define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) /* memory slots that are not exposed to userspace */ #define KVM_INTERNAL_MEM_SLOTS 3 #define KVM_HALT_POLL_NS_DEFAULT 200000 #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) #define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \ KVM_BUS_LOCK_DETECTION_EXIT) #define KVM_X86_NOTIFY_VMEXIT_VALID_BITS (KVM_X86_NOTIFY_VMEXIT_ENABLED | \ KVM_X86_NOTIFY_VMEXIT_USER) /* x86-specific vcpu->requests bit members */ #define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0) #define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1) #define KVM_REQ_TRIPLE_FAULT KVM_ARCH_REQ(2) #define KVM_REQ_MMU_SYNC KVM_ARCH_REQ(3) #define KVM_REQ_CLOCK_UPDATE KVM_ARCH_REQ(4) #define KVM_REQ_LOAD_MMU_PGD KVM_ARCH_REQ(5) #define KVM_REQ_EVENT KVM_ARCH_REQ(6) #define KVM_REQ_APF_HALT KVM_ARCH_REQ(7) #define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(8) #define KVM_REQ_NMI KVM_ARCH_REQ(9) #define KVM_REQ_PMU KVM_ARCH_REQ(10) #define KVM_REQ_PMI KVM_ARCH_REQ(11) #ifdef CONFIG_KVM_SMM #define KVM_REQ_SMI KVM_ARCH_REQ(12) #endif #define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13) #define KVM_REQ_MCLOCK_INPROGRESS \ KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_SCAN_IOAPIC \ KVM_ARCH_REQ_FLAGS(15, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_GLOBAL_CLOCK_UPDATE KVM_ARCH_REQ(16) #define KVM_REQ_APIC_PAGE_RELOAD \ KVM_ARCH_REQ_FLAGS(17, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_CRASH KVM_ARCH_REQ(18) #define KVM_REQ_IOAPIC_EOI_EXIT KVM_ARCH_REQ(19) #define KVM_REQ_HV_RESET KVM_ARCH_REQ(20) #define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) #define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) #define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23) #define KVM_REQ_GET_NESTED_STATE_PAGES KVM_ARCH_REQ(24) #define KVM_REQ_APICV_UPDATE \ KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26) #define KVM_REQ_TLB_FLUSH_GUEST \ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) #define KVM_REQ_RECALC_INTERCEPTS KVM_ARCH_REQ(29) #define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \ KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_MMU_FREE_OBSOLETE_ROOTS \ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE \ KVM_ARCH_REQ_FLAGS(34, KVM_REQUEST_WAIT) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \ | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \ | X86_CR4_LAM_SUP | X86_CR4_CET)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) #define INVALID_PAGE (~(hpa_t)0) #define VALID_PAGE(x) ((x) != INVALID_PAGE) /* KVM Hugepage definitions for x86 */ #define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G #define KVM_NR_PAGE_SIZES (KVM_MAX_HUGEPAGE_LEVEL - PG_LEVEL_4K + 1) #define 
KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9) #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) #define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50 #define KVM_MIN_ALLOC_MMU_PAGES 64UL #define KVM_MMU_HASH_SHIFT 12 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 256 #define KVM_NR_VAR_MTRR 8 #define ASYNC_PF_PER_VCPU 64 enum kvm_reg { VCPU_REGS_RAX = __VCPU_REGS_RAX, VCPU_REGS_RCX = __VCPU_REGS_RCX, VCPU_REGS_RDX = __VCPU_REGS_RDX, VCPU_REGS_RBX = __VCPU_REGS_RBX, VCPU_REGS_RSP = __VCPU_REGS_RSP, VCPU_REGS_RBP = __VCPU_REGS_RBP, VCPU_REGS_RSI = __VCPU_REGS_RSI, VCPU_REGS_RDI = __VCPU_REGS_RDI, #ifdef CONFIG_X86_64 VCPU_REGS_R8 = __VCPU_REGS_R8, VCPU_REGS_R9 = __VCPU_REGS_R9, VCPU_REGS_R10 = __VCPU_REGS_R10, VCPU_REGS_R11 = __VCPU_REGS_R11, VCPU_REGS_R12 = __VCPU_REGS_R12, VCPU_REGS_R13 = __VCPU_REGS_R13, VCPU_REGS_R14 = __VCPU_REGS_R14, VCPU_REGS_R15 = __VCPU_REGS_R15, #endif VCPU_REGS_RIP, NR_VCPU_REGS, VCPU_EXREG_PDPTR = NR_VCPU_REGS, VCPU_EXREG_CR0, VCPU_EXREG_CR3, VCPU_EXREG_CR4, VCPU_EXREG_RFLAGS, VCPU_EXREG_SEGMENTS, VCPU_EXREG_EXIT_INFO_1, VCPU_EXREG_EXIT_INFO_2, }; enum { VCPU_SREG_ES, VCPU_SREG_CS, VCPU_SREG_SS, VCPU_SREG_DS, VCPU_SREG_FS, VCPU_SREG_GS, VCPU_SREG_TR, VCPU_SREG_LDTR, }; enum exit_fastpath_completion { EXIT_FASTPATH_NONE, EXIT_FASTPATH_REENTER_GUEST, EXIT_FASTPATH_EXIT_HANDLED, EXIT_FASTPATH_EXIT_USERSPACE, }; typedef enum exit_fastpath_completion fastpath_t; struct x86_emulate_ctxt; struct x86_exception; union kvm_smram; enum x86_intercept; enum x86_intercept_stage; #define KVM_NR_DB_REGS 4 #define DR6_BUS_LOCK (1 << 11) #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) #define DR6_BT (1 << 15) #define DR6_RTM (1 << 16) /* * DR6_ACTIVE_LOW combines fixed-1 and active-low bits. * We can regard all the bits in DR6_FIXED_1 as active_low bits; * they will never be 0 for now, but when they are defined * in the future it will require no code change. * * DR6_ACTIVE_LOW is also used as the init/reset value for DR6. */ #define DR6_ACTIVE_LOW 0xffff0ff0 #define DR6_VOLATILE 0x0001e80f #define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE) #define DR7_BP_EN_MASK 0x000000ff #define DR7_GE (1 << 9) #define DR7_GD (1 << 13) #define DR7_VOLATILE 0xffff2bff #define KVM_GUESTDBG_VALID_MASK \ (KVM_GUESTDBG_ENABLE | \ KVM_GUESTDBG_SINGLESTEP | \ KVM_GUESTDBG_USE_HW_BP | \ KVM_GUESTDBG_USE_SW_BP | \ KVM_GUESTDBG_INJECT_BP | \ KVM_GUESTDBG_INJECT_DB | \ KVM_GUESTDBG_BLOCKIRQ) #define PFERR_PRESENT_MASK BIT(0) #define PFERR_WRITE_MASK BIT(1) #define PFERR_USER_MASK BIT(2) #define PFERR_RSVD_MASK BIT(3) #define PFERR_FETCH_MASK BIT(4) #define PFERR_PK_MASK BIT(5) #define PFERR_SS_MASK BIT(6) #define PFERR_SGX_MASK BIT(15) #define PFERR_GUEST_RMP_MASK BIT_ULL(31) #define PFERR_GUEST_FINAL_MASK BIT_ULL(32) #define PFERR_GUEST_PAGE_MASK BIT_ULL(33) #define PFERR_GUEST_ENC_MASK BIT_ULL(34) #define PFERR_GUEST_SIZEM_MASK BIT_ULL(35) #define PFERR_GUEST_VMPL_MASK BIT_ULL(36) /* * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP checks * when emulating instructions that triggers implicit access. */ #define PFERR_IMPLICIT_ACCESS BIT_ULL(48) /* * PRIVATE_ACCESS is a KVM-defined flag us to indicate that a fault occurred * when the guest was accessing private memory. 
*/ #define PFERR_PRIVATE_ACCESS BIT_ULL(49) #define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS) /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* * The following bit is set with PV-EOI, unset on EOI. * We detect PV-EOI changes by guest by comparing * this bit with PV-EOI in guest memory. * See the implementation in apic_update_pv_eoi. */ #define KVM_APIC_PV_EOI_PENDING 1 struct kvm_kernel_irqfd; struct kvm_kernel_irq_routing_entry; /* * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page * also includes TDP pages) to determine whether or not a page can be used in * the given MMU context. This is a subset of the overall kvm_cpu_role to * minimize the size of kvm_memory_slot.arch.gfn_write_track, i.e. allows * allocating 2 bytes per gfn instead of 4 bytes per gfn. * * Upper-level shadow pages having gptes are tracked for write-protection via * gfn_write_track. As above, gfn_write_track is a 16 bit counter, so KVM must * not create more than 2^16-1 upper-level shadow pages at a single gfn, * otherwise gfn_write_track will overflow and explosions will ensue. * * A unique shadow page (SP) for a gfn is created if and only if an existing SP * cannot be reused. The ability to reuse a SP is tracked by its role, which * incorporates various mode bits and properties of the SP. Roughly speaking, * the number of unique SPs that can theoretically be created is 2^n, where n * is the number of bits that are used to compute the role. * * But, even though there are 20 bits in the mask below, not all combinations * of modes and flags are possible: * * - invalid shadow pages are not accounted, mirror pages are not shadowed, * so the bits are effectively 18. * * - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging); * execonly and ad_disabled are only used for nested EPT which has * has_4_byte_gpte=0. Therefore, 2 bits are always unused. * * - the 4 bits of level are effectively limited to the values 2/3/4/5, * as 4k SPs are not tracked (allowed to go unsync). In addition non-PAE * paging has exactly one upper level, making level completely redundant * when has_4_byte_gpte=1. * * - on top of this, smep_andnot_wp and smap_andnot_wp are only set if * cr0_wp=0, therefore these three bits only give rise to 5 possibilities. * * Therefore, the maximum number of possible upper-level shadow pages for a * single gfn is a bit less than 2^13. */ union kvm_mmu_page_role { u32 word; struct { unsigned level:4; unsigned has_4_byte_gpte:1; unsigned quadrant:2; unsigned direct:1; unsigned access:3; unsigned invalid:1; unsigned efer_nx:1; unsigned cr0_wp:1; unsigned smep_andnot_wp:1; unsigned smap_andnot_wp:1; unsigned ad_disabled:1; unsigned guest_mode:1; unsigned passthrough:1; unsigned is_mirror:1; unsigned :4; /* * This is left at the top of the word so that * kvm_memslots_for_spte_role can extract it with a * simple shift. While there is room, give it a whole * byte so it is also faster to load it from memory. */ unsigned smm:8; }; }; /* * kvm_mmu_extended_role complements kvm_mmu_page_role, tracking properties * relevant to the current MMU configuration. When loading CR0, CR4, or EFER, * including on nested transitions, if nothing in the full role changes then * MMU re-configuration can be skipped. @valid bit is set on first usage so we * don't treat all-zero structure as valid data. 
* * The properties that are tracked in the extended role but not the page role * are for things that either (a) do not affect the validity of the shadow page * or (b) are indirectly reflected in the shadow page's role. For example, * CR4.PKE only affects permission checks for software walks of the guest page * tables (because KVM doesn't support Protection Keys with shadow paging), and * CR0.PG, CR4.PAE, and CR4.PSE are indirectly reflected in role.level. * * Note, SMEP and SMAP are not redundant with sm*p_andnot_wp in the page role. * If CR0.WP=1, KVM can reuse shadow pages for the guest regardless of SMEP and * SMAP, but the MMU's permission checks for software walks need to be SMEP and * SMAP aware regardless of CR0.WP. */ union kvm_mmu_extended_role { u32 word; struct { unsigned int valid:1; unsigned int execonly:1; unsigned int cr4_pse:1; unsigned int cr4_pke:1; unsigned int cr4_smap:1; unsigned int cr4_smep:1; unsigned int cr4_la57:1; unsigned int efer_lma:1; }; }; union kvm_cpu_role { u64 as_u64; struct { union kvm_mmu_page_role base; union kvm_mmu_extended_role ext; }; }; struct kvm_rmap_head { atomic_long_t val; }; struct kvm_pio_request { unsigned long count; int in; int port; int size; }; #define PT64_ROOT_MAX_LEVEL 5 struct rsvd_bits_validate { u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL]; u64 bad_mt_xwr; }; struct kvm_mmu_root_info { gpa_t pgd; hpa_t hpa; }; #define KVM_MMU_ROOT_INFO_INVALID \ ((struct kvm_mmu_root_info) { .pgd = INVALID_PAGE, .hpa = INVALID_PAGE }) #define KVM_MMU_NUM_PREV_ROOTS 3 #define KVM_MMU_ROOT_CURRENT BIT(0) #define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i) #define KVM_MMU_ROOTS_ALL (BIT(1 + KVM_MMU_NUM_PREV_ROOTS) - 1) #define KVM_HAVE_MMU_RWLOCK struct kvm_mmu_page; struct kvm_page_fault; /* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the * current mmu mode. */ struct kvm_mmu { unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu); u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void (*inject_page_fault)(struct kvm_vcpu *vcpu, struct x86_exception *fault); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gpa_t gva_or_gpa, u64 access, struct x86_exception *exception); int (*sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i); struct kvm_mmu_root_info root; hpa_t mirror_root_hpa; union kvm_cpu_role cpu_role; union kvm_mmu_page_role root_role; /* * The pkru_mask indicates if protection key checks are needed. It * consists of 16 domains indexed by page fault error code bits [4:1], * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables. * Each domain has 2 bits which are ANDed with AD and WD from PKRU. */ u32 pkru_mask; struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS]; /* * Bitmap; bit set = permission fault * Byte index: page fault error code [4:1] * Bit index: pte permissions in ACC_* format */ u8 permissions[16]; u64 *pae_root; u64 *pml4_root; u64 *pml5_root; /* * check zero bits on shadow page table entries, these * bits include not only hardware reserved bits but also * the bits spte never used. */ struct rsvd_bits_validate shadow_zero_check; struct rsvd_bits_validate guest_rsvd_check; u64 pdptrs[4]; /* pae */ }; enum pmc_type { KVM_PMC_GP = 0, KVM_PMC_FIXED, }; struct kvm_pmc { enum pmc_type type; u8 idx; bool is_paused; bool intr; /* * Base value of the PMC counter, relative to the *consumed* count in * the associated perf_event. 
This value includes counter updates from * the perf_event and emulated_count since the last time the counter * was reprogrammed, but it is *not* the current value as seen by the * guest or userspace. * * The count is relative to the associated perf_event so that KVM * doesn't need to reprogram the perf_event every time the guest writes * to the counter. */ u64 counter; /* * PMC events triggered by KVM emulation that haven't been fully * processed, i.e. haven't undergone overflow detection. */ u64 emulated_counter; u64 eventsel; struct perf_event *perf_event; struct kvm_vcpu *vcpu; /* * only for creating or reusing perf_event, * eventsel value for general purpose counters, * ctrl value for fixed counters. */ u64 current_config; }; /* More counters may conflict with other existing Architectural MSRs */ #define KVM_MAX(a, b) ((a) >= (b) ? (a) : (b)) #define KVM_MAX_NR_INTEL_GP_COUNTERS 8 #define KVM_MAX_NR_AMD_GP_COUNTERS 6 #define KVM_MAX_NR_GP_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_GP_COUNTERS, \ KVM_MAX_NR_AMD_GP_COUNTERS) #define KVM_MAX_NR_INTEL_FIXED_COUNTERS 3 #define KVM_MAX_NR_AMD_FIXED_COUNTERS 0 #define KVM_MAX_NR_FIXED_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_FIXED_COUNTERS, \ KVM_MAX_NR_AMD_FIXED_COUNTERS) struct kvm_pmu { u8 version; unsigned nr_arch_gp_counters; unsigned nr_arch_fixed_counters; unsigned available_event_types; u64 fixed_ctr_ctrl; u64 fixed_ctr_ctrl_rsvd; u64 global_ctrl; u64 global_status; u64 counter_bitmask[2]; u64 global_ctrl_rsvd; u64 global_status_rsvd; u64 reserved_bits; u64 raw_event_mask; struct kvm_pmc gp_counters[KVM_MAX_NR_GP_COUNTERS]; struct kvm_pmc fixed_counters[KVM_MAX_NR_FIXED_COUNTERS]; /* * Overlay the bitmap with a 64-bit atomic so that all bits can be * set in a single access, e.g. to reprogram all counters when the PMU * filter changes. */ union { DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); atomic64_t __reprogram_pmi; }; DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_counting_instructions, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_counting_branches, X86_PMC_IDX_MAX); u64 ds_area; u64 pebs_enable; u64 pebs_enable_rsvd; u64 pebs_data_cfg; u64 pebs_data_cfg_rsvd; /* * If a guest counter is cross-mapped to host counter with different * index, its PEBS capability will be temporarily disabled. * * The user should make sure that this mask is updated * after disabling interrupts and before perf_guest_get_msrs(); */ u64 host_cross_mapped_mask; /* * The gate to release perf_events not marked in * pmc_in_use only once in a vcpu time slice. */ bool need_cleanup; /* * The total number of programmed perf_events and it helps to avoid * redundant check before cleanup if guest don't use vPMU at all. */ u8 event_count; }; struct kvm_pmu_ops; enum { KVM_DEBUGREG_BP_ENABLED = BIT(0), KVM_DEBUGREG_WONT_EXIT = BIT(1), /* * Guest debug registers (DR0-3, DR6 and DR7) are saved/restored by * hardware on exit from or enter to guest. KVM needn't switch them. * DR0-3, DR6 and DR7 are set to their architectural INIT value on VM * exit, host values need to be restored. 
*/ KVM_DEBUGREG_AUTO_SWITCH = BIT(2), }; struct kvm_mtrr { u64 var[KVM_NR_VAR_MTRR * 2]; u64 fixed_64k; u64 fixed_16k[2]; u64 fixed_4k[8]; u64 deftype; }; /* Hyper-V SynIC timer */ struct kvm_vcpu_hv_stimer { struct hrtimer timer; int index; union hv_stimer_config config; u64 count; u64 exp_time; struct hv_message msg; bool msg_pending; }; /* Hyper-V synthetic interrupt controller (SynIC)*/ struct kvm_vcpu_hv_synic { u64 version; u64 control; u64 msg_page; u64 evt_page; atomic64_t sint[HV_SYNIC_SINT_COUNT]; atomic_t sint_to_gsi[HV_SYNIC_SINT_COUNT]; DECLARE_BITMAP(auto_eoi_bitmap, 256); DECLARE_BITMAP(vec_bitmap, 256); bool active; bool dont_zero_synic_pages; }; /* The maximum number of entries on the TLB flush fifo. */ #define KVM_HV_TLB_FLUSH_FIFO_SIZE (16) /* * Note: the following 'magic' entry is made up by KVM to avoid putting * anything besides GVA on the TLB flush fifo. It is theoretically possible * to observe a request to flush 4095 PFNs starting from 0xfffffffffffff000 * which will look identical. KVM's action to 'flush everything' instead of * flushing these particular addresses is, however, fully legitimate as * flushing more than requested is always OK. */ #define KVM_HV_TLB_FLUSHALL_ENTRY ((u64)-1) enum hv_tlb_flush_fifos { HV_L1_TLB_FLUSH_FIFO, HV_L2_TLB_FLUSH_FIFO, HV_NR_TLB_FLUSH_FIFOS, }; struct kvm_vcpu_hv_tlb_flush_fifo { spinlock_t write_lock; DECLARE_KFIFO(entries, u64, KVM_HV_TLB_FLUSH_FIFO_SIZE); }; /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { struct kvm_vcpu *vcpu; u32 vp_index; u64 hv_vapic; s64 runtime_offset; struct kvm_vcpu_hv_synic synic; struct kvm_hyperv_exit exit; struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT]; DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); bool enforce_cpuid; struct { u32 features_eax; /* HYPERV_CPUID_FEATURES.EAX */ u32 features_ebx; /* HYPERV_CPUID_FEATURES.EBX */ u32 features_edx; /* HYPERV_CPUID_FEATURES.EDX */ u32 enlightenments_eax; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */ u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */ u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ u32 nested_eax; /* HYPERV_CPUID_NESTED_FEATURES.EAX */ u32 nested_ebx; /* HYPERV_CPUID_NESTED_FEATURES.EBX */ } cpuid_cache; struct kvm_vcpu_hv_tlb_flush_fifo tlb_flush_fifo[HV_NR_TLB_FLUSH_FIFOS]; /* * Preallocated buffers for handling hypercalls that pass sparse vCPU * sets (for high vCPU counts, they're too large to comfortably fit on * the stack). 
*/ u64 sparse_banks[HV_MAX_SPARSE_VCPU_BANKS]; DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); struct hv_vp_assist_page vp_assist_page; struct { u64 pa_page_gpa; u64 vm_id; u32 vp_id; } nested; }; struct kvm_hypervisor_cpuid { u32 base; u32 limit; }; #ifdef CONFIG_KVM_XEN /* Xen HVM per vcpu emulation context */ struct kvm_vcpu_xen { u64 hypercall_rip; u32 current_runstate; u8 upcall_vector; struct gfn_to_pfn_cache vcpu_info_cache; struct gfn_to_pfn_cache vcpu_time_info_cache; struct gfn_to_pfn_cache runstate_cache; struct gfn_to_pfn_cache runstate2_cache; u64 last_steal; u64 runstate_entry_time; u64 runstate_times[4]; unsigned long evtchn_pending_sel; u32 vcpu_id; /* The Xen / ACPI vCPU ID */ u32 timer_virq; u64 timer_expires; /* In guest epoch */ atomic_t timer_pending; struct hrtimer timer; int poll_evtchn; struct timer_list poll_timer; struct kvm_hypervisor_cpuid cpuid; }; #endif struct kvm_queued_exception { bool pending; bool injected; bool has_error_code; u8 vector; u32 error_code; unsigned long payload; bool has_payload; }; /* * Hardware-defined CPUID leafs that are either scattered by the kernel or are * unknown to the kernel, but need to be directly used by KVM. Note, these * word values conflict with the kernel's "bug" caps, but KVM doesn't use those. */ enum kvm_only_cpuid_leafs { CPUID_12_EAX = NCAPINTS, CPUID_7_1_EDX, CPUID_8000_0007_EDX, CPUID_8000_0022_EAX, CPUID_7_2_EDX, CPUID_24_0_EBX, CPUID_8000_0021_ECX, CPUID_7_1_ECX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, }; struct kvm_vcpu_arch { /* * rip and regs accesses must go through * kvm_{register,rip}_{read,write} functions. */ unsigned long regs[NR_VCPU_REGS]; u32 regs_avail; u32 regs_dirty; unsigned long cr0; unsigned long cr0_guest_owned_bits; unsigned long cr2; unsigned long cr3; unsigned long cr4; unsigned long cr4_guest_owned_bits; unsigned long cr4_guest_rsvd_bits; unsigned long cr8; u32 host_pkru; u32 pkru; u32 hflags; u64 efer; u64 host_debugctl; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ bool load_eoi_exitmap_pending; DECLARE_BITMAP(ioapic_handled_vectors, 256); unsigned long apic_attention; int32_t apic_arb_prio; int mp_state; u64 ia32_misc_enable_msr; u64 smbase; u64 smi_count; bool at_instruction_boundary; bool tpr_access_reporting; bool xfd_no_write_intercept; u64 microcode_version; u64 arch_capabilities; u64 perf_capabilities; /* * Paging state of the vcpu * * If the vcpu runs in guest mode with two level paging this still saves * the paging mode of the l1 guest. This context is always used to * handle faults. */ struct kvm_mmu *mmu; /* Non-nested MMU for L1 */ struct kvm_mmu root_mmu; /* L1 MMU when running nested */ struct kvm_mmu guest_mmu; /* * Paging state of an L2 guest (used for nested npt) * * This context will save all necessary information to walk page tables * of an L2 guest. This context is only initialized for page table * walking and not for faulting since we never handle l2 page faults on * the host. */ struct kvm_mmu nested_mmu; /* * Pointer to the mmu context currently used for * gva_to_gpa translations. */ struct kvm_mmu *walk_mmu; struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; struct kvm_mmu_memory_cache mmu_shadow_page_cache; struct kvm_mmu_memory_cache mmu_shadowed_info_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; /* * This cache is to allocate external page table. E.g. private EPT used * by the TDX module. */ struct kvm_mmu_memory_cache mmu_external_spt_cache; /* * QEMU userspace and the guest each have their own FPU state. 
* In vcpu_run, we switch between the user and guest FPU contexts. * While running a VCPU, the VCPU thread will have the guest FPU * context. * * Note that while the PKRU state lives inside the fpu registers, * it is switched out separately at VMENTER and VMEXIT time. The * "guest_fpstate" state here contains the guest FPU context, with the * host PRKU bits. */ struct fpu_guest guest_fpu; u64 xcr0; u64 guest_supported_xcr0; u64 ia32_xss; u64 guest_supported_xss; struct kvm_pio_request pio; void *pio_data; void *sev_pio_data; unsigned sev_pio_count; u8 event_exit_inst_len; bool exception_from_userspace; /* Exceptions to be injected to the guest. */ struct kvm_queued_exception exception; /* Exception VM-Exits to be synthesized to L1. */ struct kvm_queued_exception exception_vmexit; struct kvm_queued_interrupt { bool injected; bool soft; u8 nr; } interrupt; int halt_request; /* real mode on Intel only */ int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; bool cpuid_dynamic_bits_dirty; bool is_amd_compatible; /* * cpu_caps holds the effective guest capabilities, i.e. the features * the vCPU is allowed to use. Typically, but not always, features can * be used by the guest if and only if both KVM and userspace want to * expose the feature to the guest. * * A common exception is for virtualization holes, i.e. when KVM can't * prevent the guest from using a feature, in which case the vCPU "has" * the feature regardless of what KVM or userspace desires. * * Note, features that don't require KVM involvement in any way are * NOT enforced/sanitized by KVM, i.e. are taken verbatim from the * guest CPUID provided by userspace. */ u32 cpu_caps[NR_KVM_CPU_CAPS]; u64 reserved_gpa_bits; int maxphyaddr; /* emulate context */ struct x86_emulate_ctxt *emulate_ctxt; bool emulate_regs_need_sync_to_vcpu; bool emulate_regs_need_sync_from_vcpu; int (*complete_userspace_io)(struct kvm_vcpu *vcpu); unsigned long cui_linear_rip; int cui_rdmsr_imm_reg; gpa_t time; s8 pvclock_tsc_shift; u32 pvclock_tsc_mul; unsigned int hw_tsc_khz; struct gfn_to_pfn_cache pv_time; /* set guest stopped flag in pvclock flags field */ bool pvclock_set_guest_stopped_request; struct { u8 preempted; u64 msr_val; u64 last_steal; struct gfn_to_hva_cache cache; } st; u64 l1_tsc_offset; u64 tsc_offset; /* current tsc offset */ u64 last_guest_tsc; u64 last_host_tsc; u64 tsc_offset_adjustment; u64 this_tsc_nsec; u64 this_tsc_write; u64 this_tsc_generation; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; u32 virtual_tsc_mult; u32 virtual_tsc_khz; s64 ia32_tsc_adjust_msr; u64 msr_ia32_power_ctl; u64 l1_tsc_scaling_ratio; u64 tsc_scaling_ratio; /* current scaling ratio */ atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ /* Number of NMIs pending injection, not including hardware vNMIs. 
*/ unsigned int nmi_pending; bool nmi_injected; /* Trying to inject an NMI this entry */ bool smi_pending; /* SMI queued after currently running handler */ u8 handling_intr_from_guest; struct kvm_mtrr mtrr_state; u64 pat; unsigned switch_db_regs; unsigned long db[KVM_NR_DB_REGS]; unsigned long dr6; unsigned long dr7; unsigned long eff_db[KVM_NR_DB_REGS]; unsigned long guest_debug_dr7; u64 msr_platform_info; u64 msr_misc_features_enables; u64 mcg_cap; u64 mcg_status; u64 mcg_ctl; u64 mcg_ext_ctl; u64 *mce_banks; u64 *mci_ctl2_banks; /* Cache MMIO info */ u64 mmio_gva; unsigned mmio_access; gfn_t mmio_gfn; u64 mmio_gen; struct kvm_pmu pmu; /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; #ifdef CONFIG_KVM_HYPERV bool hyperv_enabled; struct kvm_vcpu_hv *hyperv; #endif #ifdef CONFIG_KVM_XEN struct kvm_vcpu_xen xen; #endif cpumask_var_t wbinvd_dirty_mask; unsigned long last_retry_eip; unsigned long last_retry_addr; struct { bool halted; gfn_t gfns[ASYNC_PF_PER_VCPU]; struct gfn_to_hva_cache data; u64 msr_en_val; /* MSR_KVM_ASYNC_PF_EN */ u64 msr_int_val; /* MSR_KVM_ASYNC_PF_INT */ u16 vec; u32 id; u32 host_apf_flags; bool send_always; bool delivery_as_pf_vmexit; bool pageready_pending; } apf; /* OSVW MSRs (AMD only) */ struct { u64 length; u64 status; } osvw; struct { u64 msr_val; struct gfn_to_hva_cache data; } pv_eoi; u64 msr_kvm_poll_control; /* pv related host specific info */ struct { bool pv_unhalted; } pv; int pending_ioapic_eoi; int pending_external_vector; int highest_stale_pending_ioapic_eoi; /* be preempted when it's in kernel-mode(cpl=0) */ bool preempted_in_kernel; /* Flush the L1 Data cache for L1TF mitigation on VMENTER */ bool l1tf_flush_l1d; /* Host CPU on which VM-entry was most recently attempted */ int last_vmentry_cpu; /* AMD MSRC001_0015 Hardware Configuration */ u64 msr_hwcr; /* pv related cpuid info */ struct { /* * value of the eax register in the KVM_CPUID_FEATURES CPUID * leaf. */ u32 features; /* * indicates whether pv emulation should be disabled if features * are not present in the guest's cpuid */ bool enforce; } pv_cpuid; /* Protected Guests */ bool guest_state_protected; bool guest_tsc_protected; /* * Set when PDPTS were loaded directly by the userspace without * reading the guest memory */ bool pdptrs_from_userspace; #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; #endif }; struct kvm_lpage_info { int disallow_lpage; }; struct kvm_arch_memory_slot { struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; unsigned short *gfn_write_track; }; /* * Track the mode of the optimized logical map, as the rules for decoding the * destination vary per mode. Enabling the optimized logical map requires all * software-enabled local APIs to be in the same mode, each addressable APIC to * be mapped to only one MDA, and each MDA to map to at most one APIC. */ enum kvm_apic_logical_mode { /* All local APICs are software disabled. */ KVM_APIC_MODE_SW_DISABLED, /* All software enabled local APICs in xAPIC cluster addressing mode. */ KVM_APIC_MODE_XAPIC_CLUSTER, /* All software enabled local APICs in xAPIC flat addressing mode. */ KVM_APIC_MODE_XAPIC_FLAT, /* All software enabled local APICs in x2APIC mode. */ KVM_APIC_MODE_X2APIC, /* * Optimized map disabled, e.g. not all local APICs in the same logical * mode, same logical ID assigned to multiple APICs, etc. 
*/ KVM_APIC_MODE_MAP_DISABLED, }; struct kvm_apic_map { struct rcu_head rcu; enum kvm_apic_logical_mode logical_mode; u32 max_apic_id; union { struct kvm_lapic *xapic_flat_map[8]; struct kvm_lapic *xapic_cluster_map[16][4]; }; struct kvm_lapic *phys_map[]; }; /* Hyper-V synthetic debugger (SynDbg)*/ struct kvm_hv_syndbg { struct { u64 control; u64 status; u64 send_page; u64 recv_page; u64 pending_page; } control; u64 options; }; /* Current state of Hyper-V TSC page clocksource */ enum hv_tsc_page_status { /* TSC page was not set up or disabled */ HV_TSC_PAGE_UNSET = 0, /* TSC page MSR was written by the guest, update pending */ HV_TSC_PAGE_GUEST_CHANGED, /* TSC page update was triggered from the host side */ HV_TSC_PAGE_HOST_CHANGED, /* TSC page was properly set up and is currently active */ HV_TSC_PAGE_SET, /* TSC page was set up with an inaccessible GPA */ HV_TSC_PAGE_BROKEN, }; #ifdef CONFIG_KVM_HYPERV /* Hyper-V emulation context */ struct kvm_hv { struct mutex hv_lock; u64 hv_guest_os_id; u64 hv_hypercall; u64 hv_tsc_page; enum hv_tsc_page_status hv_tsc_page_status; /* Hyper-v based guest crash (NT kernel bugcheck) parameters */ u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; u64 hv_crash_ctl; struct ms_hyperv_tsc_page tsc_ref; struct idr conn_to_evt; u64 hv_reenlightenment_control; u64 hv_tsc_emulation_control; u64 hv_tsc_emulation_status; u64 hv_invtsc_control; /* How many vCPUs have VP index != vCPU index */ atomic_t num_mismatched_vp_indexes; /* * How many SynICs use 'AutoEOI' feature * (protected by arch.apicv_update_lock) */ unsigned int synic_auto_eoi_used; struct kvm_hv_syndbg hv_syndbg; bool xsaves_xsavec_checked; }; #endif struct msr_bitmap_range { u32 flags; u32 nmsrs; u32 base; unsigned long *bitmap; }; #ifdef CONFIG_KVM_XEN /* Xen emulation context */ struct kvm_xen { struct mutex xen_lock; u32 xen_version; bool long_mode; bool runstate_update_flag; u8 upcall_vector; struct gfn_to_pfn_cache shinfo_cache; struct idr evtchn_ports; unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)]; struct kvm_xen_hvm_config hvm_config; }; #endif enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; struct kvm_x86_msr_filter { u8 count; bool default_allow:1; struct msr_bitmap_range ranges[16]; }; struct kvm_x86_pmu_event_filter { __u32 action; __u32 nevents; __u32 fixed_counter_bitmap; __u32 flags; __u32 nr_includes; __u32 nr_excludes; __u64 *includes; __u64 *excludes; __u64 events[]; }; enum kvm_apicv_inhibit { /********************************************************************/ /* INHIBITs that are relevant to both Intel's APICv and AMD's AVIC. */ /********************************************************************/ /* * APIC acceleration is disabled by a module parameter * and/or not supported in hardware. */ APICV_INHIBIT_REASON_DISABLED, /* * APIC acceleration is inhibited because AutoEOI feature is * being used by a HyperV guest. */ APICV_INHIBIT_REASON_HYPERV, /* * APIC acceleration is inhibited because the userspace didn't yet * enable the kernel/split irqchip. */ APICV_INHIBIT_REASON_ABSENT, /* APIC acceleration is inhibited because KVM_GUESTDBG_BLOCKIRQ * (out of band, debug measure of blocking all interrupts on this vCPU) * was enabled, to avoid AVIC/APICv bypassing it. */ APICV_INHIBIT_REASON_BLOCKIRQ, /* * APICv is disabled because not all vCPUs have a 1:1 mapping between * APIC ID and vCPU, _and_ KVM is not applying its x2APIC hotplug hack. 
*/ APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED, /* * For simplicity, the APIC acceleration is inhibited * first time either APIC ID or APIC base are changed by the guest * from their reset values. */ APICV_INHIBIT_REASON_APIC_ID_MODIFIED, APICV_INHIBIT_REASON_APIC_BASE_MODIFIED, /******************************************************/ /* INHIBITs that are relevant only to the AMD's AVIC. */ /******************************************************/ /* * AVIC is inhibited on a vCPU because it runs a nested guest. * * This is needed because unlike APICv, the peers of this vCPU * cannot use the doorbell mechanism to signal interrupts via AVIC when * a vCPU runs nested. */ APICV_INHIBIT_REASON_NESTED, /* * On SVM, the wait for the IRQ window is implemented with pending vIRQ, * which cannot be injected when the AVIC is enabled, thus AVIC * is inhibited while KVM waits for IRQ window. */ APICV_INHIBIT_REASON_IRQWIN, /* * PIT (i8254) 're-inject' mode, relies on EOI intercept, * which AVIC doesn't support for edge triggered interrupts. */ APICV_INHIBIT_REASON_PIT_REINJ, /* * AVIC is disabled because SEV doesn't support it. */ APICV_INHIBIT_REASON_SEV, /* * AVIC is disabled because not all vCPUs with a valid LDR have a 1:1 * mapping between logical ID and vCPU. */ APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED, /* * AVIC is disabled because the vCPU's APIC ID is beyond the max * supported by AVIC/x2AVIC, i.e. the vCPU is unaddressable. */ APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG, NR_APICV_INHIBIT_REASONS, }; #define __APICV_INHIBIT_REASON(reason) \ { BIT(APICV_INHIBIT_REASON_##reason), #reason } #define APICV_INHIBIT_REASONS \ __APICV_INHIBIT_REASON(DISABLED), \ __APICV_INHIBIT_REASON(HYPERV), \ __APICV_INHIBIT_REASON(ABSENT), \ __APICV_INHIBIT_REASON(BLOCKIRQ), \ __APICV_INHIBIT_REASON(PHYSICAL_ID_ALIASED), \ __APICV_INHIBIT_REASON(APIC_ID_MODIFIED), \ __APICV_INHIBIT_REASON(APIC_BASE_MODIFIED), \ __APICV_INHIBIT_REASON(NESTED), \ __APICV_INHIBIT_REASON(IRQWIN), \ __APICV_INHIBIT_REASON(PIT_REINJ), \ __APICV_INHIBIT_REASON(SEV), \ __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED), \ __APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG) struct kvm_possible_nx_huge_pages { /* * A list of kvm_mmu_page structs that, if zapped, could possibly be * replaced by an NX huge page. A shadow page is on this list if its * existence disallows an NX huge page (nx_huge_page_disallowed is set) * and there are no other conditions that prevent a huge page, e.g. * the backing host page is huge, dirtly logging is not enabled for its * memslot, etc... Note, zapping shadow pages on this list doesn't * guarantee an NX huge page will be created in its stead, e.g. if the * guest attempts to execute from the region then KVM obviously can't * create an NX huge page (without hanging the guest). 
*/ struct list_head pages; u64 nr_pages; }; enum kvm_mmu_type { KVM_SHADOW_MMU, #ifdef CONFIG_X86_64 KVM_TDP_MMU, #endif KVM_NR_MMU_TYPES, }; struct kvm_arch { unsigned long n_used_mmu_pages; unsigned long n_requested_mmu_pages; unsigned long n_max_mmu_pages; unsigned int indirect_shadow_pages; u8 mmu_valid_gen; u8 vm_type; bool has_private_mem; bool has_protected_state; bool has_protected_eoi; bool pre_fault_allowed; struct hlist_head *mmu_page_hash; struct list_head active_mmu_pages; struct kvm_possible_nx_huge_pages possible_nx_huge_pages[KVM_NR_MMU_TYPES]; #ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING struct kvm_page_track_notifier_head track_notifier_head; #endif /* * Protects marking pages unsync during page faults, as TDP MMU page * faults only take mmu_lock for read. For simplicity, the unsync * pages lock is always taken when marking pages unsync regardless of * whether mmu_lock is held for read or write. */ spinlock_t mmu_unsync_pages_lock; u64 shadow_mmio_value; #define __KVM_HAVE_ARCH_NONCOHERENT_DMA atomic_t noncoherent_dma_count; unsigned long nr_possible_bypass_irqs; #ifdef CONFIG_KVM_IOAPIC struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; #endif atomic_t vapics_in_nmi_mode; struct mutex apic_map_lock; struct kvm_apic_map __rcu *apic_map; atomic_t apic_map_dirty; bool apic_access_memslot_enabled; bool apic_access_memslot_inhibited; /* Protects apicv_inhibit_reasons */ struct rw_semaphore apicv_update_lock; unsigned long apicv_inhibit_reasons; gpa_t wall_clock; u64 disabled_exits; s64 kvmclock_offset; /* * This also protects nr_vcpus_matched_tsc which is read from a * preemption-disabled region, so it must be a raw spinlock. */ raw_spinlock_t tsc_write_lock; u64 last_tsc_nsec; u64 last_tsc_write; u32 last_tsc_khz; u64 last_tsc_offset; u64 cur_tsc_nsec; u64 cur_tsc_write; u64 cur_tsc_offset; u64 cur_tsc_generation; int nr_vcpus_matched_tsc; u32 default_tsc_khz; bool user_set_tsc; u64 apic_bus_cycle_ns; seqcount_raw_spinlock_t pvclock_sc; bool use_master_clock; u64 master_kernel_ns; u64 master_cycle_now; struct delayed_work kvmclock_update_work; struct delayed_work kvmclock_sync_work; #ifdef CONFIG_KVM_HYPERV struct kvm_hv hyperv; #endif #ifdef CONFIG_KVM_XEN struct kvm_xen xen; #endif bool backwards_tsc_observed; bool boot_vcpu_runs_old_kvmclock; u32 bsp_vcpu_id; u64 disabled_quirks; enum kvm_irqchip_mode irqchip_mode; u8 nr_reserved_ioapic_pins; bool disabled_lapic_found; bool x2apic_format; bool x2apic_broadcast_quirk_disabled; bool has_mapped_host_mmio; bool guest_can_read_msr_platform_info; bool exception_payload_enabled; bool triple_fault_event; bool bus_lock_detection_enabled; bool enable_pmu; u32 notify_window; u32 notify_vmexit_flags; /* * If exit_on_emulation_error is set, and the in-kernel instruction * emulator fails to emulate an instruction, allow userspace * the opportunity to look at it. */ bool exit_on_emulation_error; /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ u32 user_space_msr_mask; struct kvm_x86_msr_filter __rcu *msr_filter; u32 hypercall_exit_enabled; /* Guest can access the SGX PROVISIONKEY. */ bool sgx_provisioning_allowed; struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter; struct vhost_task *nx_huge_page_recovery_thread; u64 nx_huge_page_last; struct once nx_once; #ifdef CONFIG_X86_64 #ifdef CONFIG_KVM_PROVE_MMU /* * The number of TDP MMU pages across all roots. Used only to sanity * check that KVM isn't leaking TDP MMU pages. 
*/ atomic64_t tdp_mmu_pages; #endif /* * List of struct kvm_mmu_pages being used as roots. * All struct kvm_mmu_pages in the list should have * tdp_mmu_page set. * * For reads, this list is protected by: * RCU alone or * the MMU lock in read mode + RCU or * the MMU lock in write mode * * For writes, this list is protected by tdp_mmu_pages_lock; see * below for the details. * * Roots will remain in the list until their tdp_mmu_root_count * drops to zero, at which point the thread that decremented the * count to zero should removed the root from the list and clean * it up, freeing the root after an RCU grace period. */ struct list_head tdp_mmu_roots; /* * Protects accesses to the following fields when the MMU lock * is held in read mode: * - tdp_mmu_roots (above) * - the link field of kvm_mmu_page structs used by the TDP MMU * - possible_nx_huge_pages[KVM_TDP_MMU]; * - the possible_nx_huge_page_link field of kvm_mmu_page structs used * by the TDP MMU * Because the lock is only taken within the MMU lock, strictly * speaking it is redundant to acquire this lock when the thread * holds the MMU lock in write mode. However it often simplifies * the code to do so. */ spinlock_t tdp_mmu_pages_lock; #endif /* CONFIG_X86_64 */ /* * If set, at least one shadow root has been allocated. This flag * is used as one input when determining whether certain memslot * related allocations are necessary. */ bool shadow_root_allocated; #ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING /* * If set, the VM has (or had) an external write tracking user, and * thus all write tracking metadata has been allocated, even if KVM * itself isn't using write tracking. */ bool external_write_tracking_enabled; #endif #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; spinlock_t hv_root_tdp_lock; struct hv_partition_assist_pg *hv_pa_pg; #endif /* * VM-scope maximum vCPU ID. Used to determine the size of structures * that increase along with the maximum vCPU ID, in which case, using * the global KVM_MAX_VCPU_IDS may lead to significant memory waste. */ u32 max_vcpu_ids; bool disable_nx_huge_pages; /* * Memory caches used to allocate shadow pages when performing eager * page splitting. No need for a shadowed_info_cache since eager page * splitting only allocates direct shadow pages. * * Protected by kvm->slots_lock. */ struct kvm_mmu_memory_cache split_shadow_page_cache; struct kvm_mmu_memory_cache split_page_header_cache; /* * Memory cache used to allocate pte_list_desc structs while splitting * huge pages. In the worst case, to split one huge page, 512 * pte_list_desc structs are needed to add each lower level leaf sptep * to the rmap plus 1 to extend the parent_ptes rmap of the lower level * page table. * * Protected by kvm->slots_lock. */ #define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1) struct kvm_mmu_memory_cache split_desc_cache; gfn_t gfn_direct_bits; /* * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A Zero * value indicates CPU dirty logging is unsupported or disabled in * current VM. 
*/ int cpu_dirty_log_size; }; struct kvm_vm_stat { struct kvm_vm_stat_generic generic; u64 mmu_shadow_zapped; u64 mmu_pte_write; u64 mmu_pde_zapped; u64 mmu_flooded; u64 mmu_recycled; u64 mmu_cache_miss; u64 mmu_unsync; union { struct { atomic64_t pages_4k; atomic64_t pages_2m; atomic64_t pages_1g; }; atomic64_t pages[KVM_NR_PAGE_SIZES]; }; u64 nx_lpage_splits; u64 max_mmu_page_hash_collisions; u64 max_mmu_rmap_size; }; struct kvm_vcpu_stat { struct kvm_vcpu_stat_generic generic; u64 pf_taken; u64 pf_fixed; u64 pf_emulate; u64 pf_spurious; u64 pf_fast; u64 pf_mmio_spte_created; u64 pf_guest; u64 tlb_flush; u64 invlpg; u64 exits; u64 io_exits; u64 mmio_exits; u64 signal_exits; u64 irq_window_exits; u64 nmi_window_exits; u64 l1d_flush; u64 halt_exits; u64 request_irq_exits; u64 irq_exits; u64 host_state_reload; u64 fpu_reload; u64 insn_emulation; u64 insn_emulation_fail; u64 hypercalls; u64 irq_injections; u64 nmi_injections; u64 req_event; u64 nested_run; u64 directed_yield_attempted; u64 directed_yield_successful; u64 preemption_reported; u64 preemption_other; u64 guest_mode; u64 notify_window_exits; }; struct x86_instruction_info; struct msr_data { bool host_initiated; u32 index; u64 data; }; struct kvm_lapic_irq { u32 vector; u16 delivery_mode; u16 dest_mode; bool level; u16 trig_mode; u32 shorthand; u32 dest_id; bool msi_redir_hint; }; static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) { return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; } enum kvm_x86_run_flags { KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0), KVM_RUN_LOAD_GUEST_DR6 = BIT(1), KVM_RUN_LOAD_DEBUGCTL = BIT(2), }; struct kvm_x86_ops { const char *name; int (*check_processor_compatibility)(void); int (*enable_virtualization_cpu)(void); void (*disable_virtualization_cpu)(void); cpu_emergency_virt_cb *emergency_disable_virtualization_cpu; void (*hardware_unsetup)(void); bool (*has_emulated_msr)(struct kvm *kvm, u32 index); void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); unsigned int vm_size; int (*vm_init)(struct kvm *kvm); void (*vm_destroy)(struct kvm *kvm); void (*vm_pre_destroy)(struct kvm *kvm); /* Create, but do not attach this VCPU */ int (*vcpu_precreate)(struct kvm *kvm); int (*vcpu_create)(struct kvm_vcpu *vcpu); void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); void (*prepare_switch_to_guest)(struct kvm_vcpu *vcpu); void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); /* * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to * match the host's value even while the guest is active. 
*/ const u64 HOST_OWNED_DEBUGCTL; void (*update_exception_bitmap)(struct kvm_vcpu *vcpu); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); void (*get_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int (*get_cpl)(struct kvm_vcpu *vcpu); int (*get_cpl_no_cache)(struct kvm_vcpu *vcpu); void (*set_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); bool (*is_valid_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); bool (*get_if_flag)(struct kvm_vcpu *vcpu); void (*flush_tlb_all)(struct kvm_vcpu *vcpu); void (*flush_tlb_current)(struct kvm_vcpu *vcpu); #if IS_ENABLED(CONFIG_HYPERV) int (*flush_remote_tlbs)(struct kvm *kvm); int (*flush_remote_tlbs_range)(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages); #endif /* * Flush any TLB entries associated with the given GVA. * Does not need to flush GPA->HPA mappings. * Can potentially get non-canonical addresses through INVLPGs, which * the implementation may choose to ignore if appropriate. */ void (*flush_tlb_gva)(struct kvm_vcpu *vcpu, gva_t addr); /* * Flush any TLB entries created by the guest. Like tlb_flush_gva(), * does not need to flush GPA->HPA mappings. */ void (*flush_tlb_guest)(struct kvm_vcpu *vcpu); int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu, u64 run_flags); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*update_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); void (*inject_irq)(struct kvm_vcpu *vcpu, bool reinjected); void (*inject_nmi)(struct kvm_vcpu *vcpu); void (*inject_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection); int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); /* Whether or not a virtual NMI is pending in hardware. */ bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu); /* * Attempt to pend a virtual NMI in hardware. Returns %true on success * to allow using static_call_ret0 as the fallback. 
*/ bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu); void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); const bool x2apic_icr_is_split; const unsigned long required_apicv_inhibits; bool allow_apicv_in_x2apic_without_x2apic_virtualization; void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu); void (*deliver_interrupt)(struct kvm_lapic *apic, int delivery_mode, int trig_mode, int vector); int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); u8 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); /* Update external mapping with page table link. */ int (*link_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level, void *external_spt); /* Update the external page table from spte getting set. */ int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, kvm_pfn_t pfn_for_gfn); /* Update external page tables for page table about to be freed. */ int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level, void *external_spt); /* Update external page table from spte getting removed, and flush TLB. */ int (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, kvm_pfn_t pfn_for_gfn); bool (*has_wbinvd_exit)(void); u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu); void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu); /* * Retrieve somewhat arbitrary exit/entry information. Intended to * be used only from within tracepoints or error paths. 
*/ void (*get_exit_info)(struct kvm_vcpu *vcpu, u32 *reason, u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code); void (*get_entry_info)(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code); int (*check_intercept)(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage, struct x86_exception *exception); void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu); const struct kvm_x86_nested_ops *nested_ops; void (*vcpu_blocking)(struct kvm_vcpu *vcpu); void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); int (*pi_update_irte)(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, struct kvm_vcpu *vcpu, u32 vector); void (*pi_start_bypass)(struct kvm *kvm); void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); bool (*protected_apic_has_interrupt)(struct kvm_vcpu *vcpu); int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, bool *expired); void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); void (*setup_mce)(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM_SMM int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); int (*enter_smm)(struct kvm_vcpu *vcpu, union kvm_smram *smram); int (*leave_smm)(struct kvm_vcpu *vcpu, const union kvm_smram *smram); void (*enable_smi_window)(struct kvm_vcpu *vcpu); #endif int (*dev_get_attr)(u32 group, u64 attr, u64 *val); int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp); int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); void (*guest_memory_reclaimed)(struct kvm *kvm); int (*get_feature_msr)(u32 msr, u64 *data); int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu); void (*migrate_timers)(struct kvm_vcpu *vcpu); void (*recalc_intercepts)(struct kvm_vcpu *vcpu); int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err); void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector); /* * Returns vCPU specific APICv inhibit reasons */ unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu); gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end); int (*gmem_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn, bool is_private); }; struct kvm_x86_nested_ops { void (*leave_nested)(struct kvm_vcpu *vcpu); bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector, u32 error_code); int (*check_events)(struct kvm_vcpu *vcpu); bool (*has_events)(struct kvm_vcpu *vcpu, bool for_injection); void (*triple_fault)(struct kvm_vcpu *vcpu); int (*get_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, unsigned user_data_size); int (*set_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, struct kvm_nested_state *kvm_state); bool 
(*get_nested_state_pages)(struct kvm_vcpu *vcpu); int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa); int (*enable_evmcs)(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu); void (*hv_inject_synthetic_vmexit_post_tlb_flush)(struct kvm_vcpu *vcpu); }; struct kvm_x86_init_ops { int (*hardware_setup)(void); unsigned int (*handle_intel_pt_intr)(void); struct kvm_x86_ops *runtime_ops; struct kvm_pmu_ops *pmu_ops; }; struct kvm_arch_async_pf { u32 token; gfn_t gfn; unsigned long cr3; bool direct_map; u64 error_code; }; extern u32 __read_mostly kvm_nr_uret_msrs; extern bool __read_mostly allow_smaller_maxphyaddr; extern bool __read_mostly enable_apicv; extern bool __read_mostly enable_ipiv; extern bool __read_mostly enable_device_posted_irqs; extern struct kvm_x86_ops kvm_x86_ops; #define kvm_x86_call(func) static_call(kvm_x86_##func) #define kvm_pmu_call(func) static_call(kvm_x86_pmu_##func) #define KVM_X86_OP(func) \ DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func)); #define KVM_X86_OP_OPTIONAL KVM_X86_OP #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include <asm/kvm-x86-ops.h> int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops); void kvm_x86_vendor_exit(void); #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { return kvzalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT); } #define __KVM_HAVE_ARCH_VM_FREE void kvm_arch_free_vm(struct kvm *kvm); #if IS_ENABLED(CONFIG_HYPERV) #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { if (kvm_x86_ops.flush_remote_tlbs && !kvm_x86_call(flush_remote_tlbs)(kvm)) return 0; else return -ENOTSUPP; } #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { if (!kvm_x86_ops.flush_remote_tlbs_range) return -EOPNOTSUPP; return kvm_x86_call(flush_remote_tlbs_range)(kvm, gfn, nr_pages); } #endif /* CONFIG_HYPERV */ enum kvm_intr_type { /* Values are arbitrary, but must be non-zero. */ KVM_HANDLING_IRQ = 1, KVM_HANDLING_NMI, }; /* Enable perf NMI and timer modes to work, and minimise false positives. 
*/ #define kvm_arch_pmi_in_guest(vcpu) \ ((vcpu) && (vcpu)->arch.handling_intr_from_guest && \ (!!in_nmi() == ((vcpu)->arch.handling_intr_from_guest == KVM_HANDLING_NMI))) void __init kvm_mmu_x86_module_init(void); int kvm_mmu_vendor_module_init(void); void kvm_mmu_vendor_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); int kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, const struct kvm_memory_slot *memslot, int start_level); void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, int target_level); void kvm_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, u64 start, u64 end, int target_level); void kvm_mmu_recover_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); extern bool tdp_enabled; u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); /* * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing * userspace I/O) to indicate that the emulation context * should be reused as is, i.e. skip initialization of * emulation context, instruction fetch and decode. * * EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware. * Indicates that only select instructions (tagged with * EmulateOnUD) should be emulated (to minimize the emulator * attack surface). See also EMULTYPE_TRAP_UD_FORCED. * * EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to * decode the instruction length. For use *only* by * kvm_x86_ops.skip_emulated_instruction() implementations if * EMULTYPE_COMPLETE_USER_EXIT is not set. * * EMULTYPE_ALLOW_RETRY_PF - Set when the emulator should resume the guest to * retry native execution under certain conditions, * Can only be set in conjunction with EMULTYPE_PF. * * EMULTYPE_TRAP_UD_FORCED - Set when emulating an intercepted #UD that was * triggered by KVM's magic "force emulation" prefix, * which is opt in via module param (off by default). * Bypasses EmulateOnUD restriction despite emulating * due to an intercepted #UD (see EMULTYPE_TRAP_UD). * Used to test the full emulator from userspace. * * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware * backdoor emulation, which is opt in via module param. * VMware backdoor emulation handles select instructions * and reinjects the #GP for all other cases. * * EMULTYPE_PF - Set when an intercepted #PF triggers the emulation, in which case * the CR2/GPA value pass on the stack is valid. * * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility * state and inject single-step #DBs after skipping * an instruction (after completing userspace I/O). 
* * EMULTYPE_WRITE_PF_TO_SP - Set when emulating an intercepted page fault that * is attempting to write a gfn that contains one or * more of the PTEs used to translate the write itself, * and the owning page table is being shadowed by KVM. * If emulation of the faulting instruction fails and * this flag is set, KVM will exit to userspace instead * of retrying emulation as KVM cannot make forward * progress. * * If emulation fails for a write to guest page tables, * KVM unprotects (zaps) the shadow page for the target * gfn and resumes the guest to retry the non-emulatable * instruction (on hardware). Unprotecting the gfn * doesn't allow forward progress for a self-changing * access because doing so also zaps the translation for * the gfn, i.e. retrying the instruction will hit a * !PRESENT fault, which results in a new shadow page * and sends KVM back to square one. */ #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) #define EMULTYPE_ALLOW_RETRY_PF (1 << 3) #define EMULTYPE_TRAP_UD_FORCED (1 << 4) #define EMULTYPE_VMWARE_GP (1 << 5) #define EMULTYPE_PF (1 << 6) #define EMULTYPE_COMPLETE_USER_EXIT (1 << 7) #define EMULTYPE_WRITE_PF_TO_SP (1 << 8) static inline bool kvm_can_emulate_event_vectoring(int emul_type) { return !(emul_type & EMULTYPE_PF); } int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, void *insn, int insn_len); void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, u8 ndata); void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu); void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa); void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu); int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu); int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); int kvm_emulate_as_nop(struct kvm_vcpu *vcpu); int kvm_emulate_invd(struct kvm_vcpu *vcpu); int kvm_emulate_mwait(struct kvm_vcpu *vcpu); int kvm_handle_invalid_op(struct kvm_vcpu *vcpu); int kvm_emulate_monitor(struct kvm_vcpu *vcpu); int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in); int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu); int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0); void kvm_post_set_cr4(struct kvm_vcpu 
*vcpu, unsigned long old_cr4, unsigned long cr4); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu); int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu); void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr, bool has_error_code, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr); static inline int __kvm_irq_line_state(unsigned long *irq_state, int irq_source_id, int level) { /* Logical OR for level trig interrupt */ if (level) __set_bit(irq_source_id, irq_state); else __clear_bit(irq_source_id, irq_state); return !!(*irq_state); } void kvm_inject_nmi(struct kvm_vcpu *vcpu); int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool always_retry); static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa) { return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false); } void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); bool kvm_apicv_activated(struct kvm *kvm); bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu); void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason, bool set); void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason, bool set); static inline void kvm_set_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason) { kvm_set_or_clear_apicv_inhibit(kvm, reason, true); } static inline void kvm_clear_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason) { kvm_set_or_clear_apicv_inhibit(kvm, reason, false); } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len); void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void 
kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 addr, unsigned long roots); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd); void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level); #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem) #endif #define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state) static inline u16 kvm_read_ldt(void) { u16 ldt; asm("sldt %0" : "=g"(ldt)); return ldt; } static inline void kvm_load_ldt(u16 sel) { asm("lldt %0" : : "rm"(sel)); } #ifdef CONFIG_X86_64 static inline unsigned long read_msr(unsigned long msr) { u64 value; rdmsrq(msr, value); return value; } #endif static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) { kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); } #define TSS_IOPB_BASE_OFFSET 0x66 #define TSS_BASE_SIZE 0x68 #define TSS_IOPB_SIZE (65536 / 8) #define TSS_REDIRECTION_SIZE (256 / 8) #define RMODE_TSS_SIZE \ (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) enum { TASK_SWITCH_CALL = 0, TASK_SWITCH_IRET = 1, TASK_SWITCH_JMP = 2, TASK_SWITCH_GATE = 3, }; #define HF_GUEST_MASK (1 << 0) /* VCPU is in guest-mode */ #ifdef CONFIG_KVM_SMM #define HF_SMM_MASK (1 << 1) #define HF_SMM_INSIDE_NMI_MASK (1 << 2) # define KVM_MAX_NR_ADDRESS_SPACES 2 /* SMM is currently unsupported for guests with private memory. */ # define kvm_arch_nr_memslot_as_ids(kvm) (kvm_arch_has_private_mem(kvm) ? 1 : 2) # define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) #else # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0) #endif int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_cpu_has_extint(struct kvm_vcpu *v); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_extint(struct kvm_vcpu *v); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit); int kvm_add_user_return_msr(u32 msr); int kvm_find_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); void kvm_user_return_msr_update_cache(unsigned int index, u64 val); u64 kvm_get_user_return_msr(unsigned int slot); static inline bool kvm_is_supported_user_return_msr(u32 msr) { return kvm_find_user_return_msr(msr) >= 0; } u64 kvm_scale_tsc(u64 tsc, u64 ratio); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier); u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier); unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); void kvm_make_scan_ioapic_request(struct kvm *kvm); void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, unsigned long *vcpu_bitmap); bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_present_queued(struct 
kvm_vcpu *vcpu); bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu); extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) { /* We can only post Fixed and LowPrio IRQs */ return (irq->delivery_mode == APIC_DM_FIXED || irq->delivery_mode == APIC_DM_LOWEST); } static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { kvm_x86_call(vcpu_blocking)(vcpu); } static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) { kvm_x86_call(vcpu_unblocking)(vcpu); } static inline int kvm_cpu_get_apicid(int mps_cpu) { #ifdef CONFIG_X86_LOCAL_APIC return default_cpu_present_to_apicid(mps_cpu); #else WARN_ON_ONCE(1); return BAD_APICID; #endif } int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); #define KVM_CLOCK_VALID_FLAGS \ (KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC) #define KVM_X86_VALID_QUIRKS \ (KVM_X86_QUIRK_LINT0_REENABLED | \ KVM_X86_QUIRK_CD_NW_CLEARED | \ KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \ KVM_X86_QUIRK_OUT_7E_INC_RIP | \ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \ KVM_X86_QUIRK_SLOT_ZAP_ALL | \ KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \ KVM_X86_QUIRK_IGNORE_GUEST_PAT) #define KVM_X86_CONDITIONAL_QUIRKS \ (KVM_X86_QUIRK_CD_NW_CLEARED | \ KVM_X86_QUIRK_IGNORE_GUEST_PAT) /* * KVM previously used a u32 field in kvm_run to indicate the hypercall was * initiated from long mode. KVM now sets bit 0 to indicate long mode, but the * remaining 31 lower bits must be 0 to preserve ABI. */ #define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1) static inline bool kvm_arch_has_irq_bypass(void) { return enable_device_posted_irqs; } #endif /* _ASM_X86_KVM_HOST_H */
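/*
 * Editorial illustration (not part of the header above): a minimal sketch of
 * one way a vendor module could plug into the hooks declared in this file.
 * All "example_*" names are invented for illustration; a real module
 * (e.g. kvm-intel or kvm-amd) fills in many more callbacks, including
 * pmu_ops. The sketch assumes <linux/kvm_host.h> is included.
 */
static int example_hardware_setup(void)
{
	/* Probe/configure virtualization hardware here; nothing in this sketch. */
	return 0;
}

static int example_vm_init(struct kvm *kvm)
{
	return 0;
}

static struct kvm_x86_ops example_x86_ops = {
	.name		= "example",
	/* kvm_arch_alloc_vm() allocates vm_size bytes per VM. */
	.vm_size	= sizeof(struct kvm),
	.vm_init	= example_vm_init,
	/* ... the remaining callbacks would be filled in here ... */
};

static struct kvm_x86_init_ops example_init_ops = {
	.hardware_setup	= example_hardware_setup,
	.runtime_ops	= &example_x86_ops,
	/* .pmu_ops would normally be provided as well. */
};

/* Called from the module's init path in this sketch. */
static int __init example_vendor_init(void)
{
	/*
	 * kvm_x86_vendor_init() installs runtime_ops so that the
	 * kvm_x86_call()/static_call() wrappers dispatch to the
	 * example_* callbacks.
	 */
	return kvm_x86_vendor_init(&example_init_ops);
}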
// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 * Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of walked PTEs.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or is mapped from.
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations.
 */
struct vmemmap_remap_walk {
	void			(*remap_pte)(pte_t *pte, unsigned long addr,
					     struct vmemmap_remap_walk *walk);
	unsigned long		nr_walked;
	struct page		*reuse_page;
	unsigned long		reuse_addr;
	struct list_head	*vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
/* synchronize_rcu() to avoid writes from page_ref_add_unless() */
#define VMEMMAP_SYNCHRONIZE_RCU		BIT(2)
	unsigned long		flags;
};

static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	pte_t *pgtable;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from buddy allocator must be able to
		 * be treated as independent small pages (as they can be freed
		 * individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
			flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}

static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	int ret = 0;
	struct page *head;
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/* Only splitting, not remapping the vmemmap pages. */
	if (!vmemmap_walk->remap_pte)
		walk->action = ACTION_CONTINUE;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	/*
	 * Due to HugeTLB alignment requirements, and the vmemmap pages being
	 * at the start of the hotplugged memory region in the
	 * memory_hotplug.memmap_on_memory case, checking whether the vmemmap
	 * page associated with the first vmemmap page is self-hosted is
	 * sufficient.
* * [ hotplugged memory ] * [ section ][...][ section ] * [ vmemmap ][ usable memory ] * ^ | ^ | * +--+ | | * +------------------------+ */ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) { struct page *page = head ? head + pte_index(addr) : pte_page(ptep_get(pte_offset_kernel(pmd, addr))); if (PageVmemmapSelfHosted(page)) ret = -ENOTSUPP; } spin_unlock(&init_mm.page_table_lock); if (!head || ret) return ret; return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk); } static int vmemmap_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct vmemmap_remap_walk *vmemmap_walk = walk->private; /* * The reuse_page is found 'first' in page table walking before * starting remapping. */ if (!vmemmap_walk->reuse_page) vmemmap_walk->reuse_page = pte_page(ptep_get(pte)); else vmemmap_walk->remap_pte(pte, addr, vmemmap_walk); vmemmap_walk->nr_walked++; return 0; } static const struct mm_walk_ops vmemmap_remap_ops = { .pmd_entry = vmemmap_pmd_entry, .pte_entry = vmemmap_pte_entry, }; static int vmemmap_remap_range(unsigned long start, unsigned long end, struct vmemmap_remap_walk *walk) { int ret; VM_BUG_ON(!PAGE_ALIGNED(start | end)); mmap_read_lock(&init_mm); ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops, NULL, walk); mmap_read_unlock(&init_mm); if (ret) return ret; if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH)) flush_tlb_kernel_range(start, end); return 0; } /* * Free a vmemmap page. A vmemmap page can be allocated from the memblock * allocator or buddy allocator. If the PG_reserved flag is set, it means * that it allocated from the memblock allocator, just free it via the * free_bootmem_page(). Otherwise, use __free_page(). */ static inline void free_vmemmap_page(struct page *page) { if (PageReserved(page)) { memmap_boot_pages_add(-1); free_bootmem_page(page); } else { memmap_pages_add(-1); __free_page(page); } } /* Free a list of the vmemmap pages */ static void free_vmemmap_page_list(struct list_head *list) { struct page *page, *next; list_for_each_entry_safe(page, next, list, lru) free_vmemmap_page(page); } static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { /* * Remap the tail pages as read-only to catch illegal write operation * to the tail pages. */ pgprot_t pgprot = PAGE_KERNEL_RO; struct page *page = pte_page(ptep_get(pte)); pte_t entry; /* Remapping the head page requires r/w */ if (unlikely(addr == walk->reuse_addr)) { pgprot = PAGE_KERNEL; list_del(&walk->reuse_page->lru); /* * Makes sure that preceding stores to the page contents from * vmemmap_remap_free() become visible before the set_pte_at() * write. */ smp_wmb(); } entry = mk_pte(walk->reuse_page, pgprot); list_add(&page->lru, walk->vmemmap_pages); set_pte_at(&init_mm, addr, pte, entry); } /* * How many struct page structs need to be reset. When we reuse the head * struct page, the special metadata (e.g. page->flags or page->mapping) * cannot copy to the tail struct page structs. The invalid value will be * checked in the free_tail_page_prepare(). In order to avoid the message * of "corrupted mapping in tail page". We need to reset at least 4 (one * head struct page struct and three tail struct page structs) struct page * structs. 
*/ #define NR_RESET_STRUCT_PAGE 4 static inline void reset_struct_pages(struct page *start) { struct page *from = start + NR_RESET_STRUCT_PAGE; BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page)); memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE); } static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { pgprot_t pgprot = PAGE_KERNEL; struct page *page; void *to; BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page); page = list_first_entry(walk->vmemmap_pages, struct page, lru); list_del(&page->lru); to = page_to_virt(page); copy_page(to, (void *)walk->reuse_addr); reset_struct_pages(to); /* * Makes sure that preceding stores to the page contents become visible * before the set_pte_at() write. */ smp_wmb(); set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); } /** * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end) * backing PMDs of the directmap into PTEs * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_split(unsigned long start, unsigned long end, unsigned long reuse) { struct vmemmap_remap_walk walk = { .remap_pte = NULL, .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH, }; /* See the comment in the vmemmap_remap_free(). */ BUG_ON(start - reuse != PAGE_SIZE); return vmemmap_remap_range(reuse, end, &walk); } /** * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) * to the page which @reuse is mapped to, then free vmemmap * which the range are mapped to. * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers * responsibility to free pages. * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse, struct list_head *vmemmap_pages, unsigned long flags) { int ret; struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_remap_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, .flags = flags, }; int nid = page_to_nid((struct page *)reuse); gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; /* * Allocate a new head vmemmap page to avoid breaking a contiguous * block of struct page memory when freeing it back to page allocator * in free_vmemmap_page_list(). This will allow the likely contiguous * struct page backing memory to be kept contiguous and allowing for * more allocations of hugepages. Fallback to the currently * mapped head page in case should it fail to allocate. */ walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0); if (walk.reuse_page) { copy_page(page_to_virt(walk.reuse_page), (void *)walk.reuse_addr); list_add(&walk.reuse_page->lru, vmemmap_pages); memmap_pages_add(1); } /* * In order to make remapping routine most efficient for the huge pages, * the routine of vmemmap page table walking has the following rules * (see more details from the vmemmap_pte_range()): * * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE) * should be continuous. 
* - The @reuse address is part of the range [@reuse, @end) that we are * walking which is passed to vmemmap_remap_range(). * - The @reuse address is the first in the complete range. * * So we need to make sure that @start and @reuse meet the above rules. */ BUG_ON(start - reuse != PAGE_SIZE); ret = vmemmap_remap_range(reuse, end, &walk); if (ret && walk.nr_walked) { end = reuse + walk.nr_walked * PAGE_SIZE; /* * vmemmap_pages contains pages from the previous * vmemmap_remap_range call which failed. These * are pages which were removed from the vmemmap. * They will be restored in the following call. */ walk = (struct vmemmap_remap_walk) { .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, .flags = 0, }; vmemmap_remap_range(reuse, end, &walk); } return ret; } static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, struct list_head *list) { gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL; unsigned long nr_pages = (end - start) >> PAGE_SHIFT; int nid = page_to_nid((struct page *)start); struct page *page, *next; int i; for (i = 0; i < nr_pages; i++) { page = alloc_pages_node(nid, gfp_mask, 0); if (!page) goto out; list_add(&page->lru, list); } memmap_pages_add(nr_pages); return 0; out: list_for_each_entry_safe(page, next, list, lru) __free_page(page); return -ENOMEM; } /** * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) * to the page which is from the @vmemmap_pages * respectively. * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, unsigned long reuse, unsigned long flags) { LIST_HEAD(vmemmap_pages); struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = &vmemmap_pages, .flags = flags, }; /* See the comment in the vmemmap_remap_free(). */ BUG_ON(start - reuse != PAGE_SIZE); if (alloc_vmemmap_page_list(start, end, &vmemmap_pages)) return -ENOMEM; return vmemmap_remap_range(reuse, end, &walk); } DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); static int __init hugetlb_vmemmap_optimize_param(char *buf) { return kstrtobool(buf, &vmemmap_optimize_enabled); } early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param); static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio, unsigned long flags) { int ret; unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; unsigned long vmemmap_reuse; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); if (!folio_test_hugetlb_vmemmap_optimized(folio)) return 0; if (flags & VMEMMAP_SYNCHRONIZE_RCU) synchronize_rcu(); vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * The pages which the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end) are mapped to are freed to the buddy allocator, and * the range is mapped to the page which @vmemmap_reuse is mapped to. 
* When a HugeTLB page is freed to the buddy allocator, previously * discarded vmemmap pages must be allocated and remapping. */ ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags); if (!ret) { folio_clear_hugetlb_vmemmap_optimized(folio); static_branch_dec(&hugetlb_optimize_vmemmap_key); } return ret; } /** * hugetlb_vmemmap_restore_folio - restore previously optimized (by * hugetlb_vmemmap_optimize_folio()) vmemmap pages which * will be reallocated and remapped. * @h: struct hstate. * @folio: the folio whose vmemmap pages will be restored. * * Return: %0 if @folio's vmemmap pages have been reallocated and remapped, * negative error code otherwise. */ int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU); } /** * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list. * @h: hstate. * @folio_list: list of folios. * @non_hvo_folios: Output list of folios for which vmemmap exists. * * Return: number of folios for which vmemmap was restored, or an error code * if an error was encountered restoring vmemmap for a folio. * Folios that have vmemmap are moved to the non_hvo_folios * list. Processing of entries stops when the first error is * encountered. The folio that experienced the error and all * non-processed folios will remain on folio_list. */ long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios) { struct folio *folio, *t_folio; long restored = 0; long ret = 0; unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { ret = __hugetlb_vmemmap_restore_folio(h, folio, flags); /* only need to synchronize_rcu() once for each batch */ flags &= ~VMEMMAP_SYNCHRONIZE_RCU; if (ret) break; restored++; } /* Add non-optimized folios to output list */ list_move(&folio->lru, non_hvo_folios); } if (restored) flush_tlb_all(); if (!ret) ret = restored; return ret; } /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */ static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio) { if (folio_test_hugetlb_vmemmap_optimized(folio)) return false; if (!READ_ONCE(vmemmap_optimize_enabled)) return false; if (!hugetlb_vmemmap_optimizable(h)) return false; return true; } static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio, struct list_head *vmemmap_pages, unsigned long flags) { int ret = 0; unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; unsigned long vmemmap_reuse; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); if (!vmemmap_should_optimize_folio(h, folio)) return ret; static_branch_inc(&hugetlb_optimize_vmemmap_key); if (flags & VMEMMAP_SYNCHRONIZE_RCU) synchronize_rcu(); /* * Very Subtle * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed * immediately after remapping. As a result, subsequent accesses * and modifications to struct pages associated with the hugetlb * page could be to the OLD struct pages. Set the vmemmap optimized * flag here so that it is copied to the new head page. This keeps * the old and new struct pages in sync. * If there is an error during optimization, we will immediately FLUSH * the TLB and clear the flag below. 
*/ folio_set_hugetlb_vmemmap_optimized(folio); vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end) * to the page which @vmemmap_reuse is mapped to. Add pages previously * mapping the range to vmemmap_pages list so that they can be freed by * the caller. */ ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages, flags); if (ret) { static_branch_dec(&hugetlb_optimize_vmemmap_key); folio_clear_hugetlb_vmemmap_optimized(folio); } return ret; } /** * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages. * @h: struct hstate. * @folio: the folio whose vmemmap pages will be optimized. * * This function only tries to optimize @folio's vmemmap pages and does not * guarantee that the optimization will succeed after it returns. The caller * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's * vmemmap pages have been optimized. */ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { LIST_HEAD(vmemmap_pages); __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU); free_vmemmap_page_list(&vmemmap_pages); } static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio) { unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; unsigned long vmemmap_reuse; if (!vmemmap_should_optimize_folio(h, folio)) return 0; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * Split PMDs on the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end] */ return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse); } static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list, bool boot) { struct folio *folio; int nr_to_optimize; LIST_HEAD(vmemmap_pages); unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; nr_to_optimize = 0; list_for_each_entry(folio, folio_list, lru) { int ret; unsigned long spfn, epfn; if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) { /* * Already optimized by pre-HVO, just map the * mirrored tail page structs RO. */ spfn = (unsigned long)&folio->page; epfn = spfn + pages_per_huge_page(h); vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio), HUGETLB_VMEMMAP_RESERVE_SIZE); register_page_bootmem_memmap(pfn_to_section_nr(spfn), &folio->page, HUGETLB_VMEMMAP_RESERVE_SIZE); static_branch_inc(&hugetlb_optimize_vmemmap_key); continue; } nr_to_optimize++; ret = hugetlb_vmemmap_split_folio(h, folio); /* * Spliting the PMD requires allocating a page, thus lets fail * early once we encounter the first OOM. No point in retrying * as it can be dynamically done on remap with the memory * we get back from the vmemmap deduplication. */ if (ret == -ENOMEM) break; } if (!nr_to_optimize) /* * All pre-HVO folios, nothing left to do. It's ok if * there is a mix of pre-HVO and not yet HVO-ed folios * here, as __hugetlb_vmemmap_optimize_folio() will * skip any folios that already have the optimized flag * set, see vmemmap_should_optimize_folio(). */ goto out; flush_tlb_all(); list_for_each_entry(folio, folio_list, lru) { int ret; ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); /* only need to synchronize_rcu() once for each batch */ flags &= ~VMEMMAP_SYNCHRONIZE_RCU; /* * Pages to be freed may have been accumulated. 
If we * encounter an ENOMEM, free what we have and try again. * This can occur in the case that both spliting fails * halfway and head page allocation also failed. In this * case __hugetlb_vmemmap_optimize_folio() would free memory * allowing more vmemmap remaps to occur. */ if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); INIT_LIST_HEAD(&vmemmap_pages); __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); } } out: flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); } void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) { __hugetlb_vmemmap_optimize_folios(h, folio_list, false); } void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list) { __hugetlb_vmemmap_optimize_folios(h, folio_list, true); } #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT /* Return true of a bootmem allocated HugeTLB page should be pre-HVO-ed */ static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m) { unsigned long section_size, psize, pmd_vmemmap_size; phys_addr_t paddr; if (!READ_ONCE(vmemmap_optimize_enabled)) return false; if (!hugetlb_vmemmap_optimizable(m->hstate)) return false; psize = huge_page_size(m->hstate); paddr = virt_to_phys(m); /* * Pre-HVO only works if the bootmem huge page * is aligned to the section size. */ section_size = (1UL << PA_SECTION_SHIFT); if (!IS_ALIGNED(paddr, section_size) || !IS_ALIGNED(psize, section_size)) return false; /* * The pre-HVO code does not deal with splitting PMDS, * so the bootmem page must be aligned to the number * of base pages that can be mapped with one vmemmap PMD. */ pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT; if (!IS_ALIGNED(paddr, pmd_vmemmap_size) || !IS_ALIGNED(psize, pmd_vmemmap_size)) return false; return true; } /* * Initialize memmap section for a gigantic page, HVO-style. */ void __init hugetlb_vmemmap_init_early(int nid) { unsigned long psize, paddr, section_size; unsigned long ns, i, pnum, pfn, nr_pages; unsigned long start, end; struct huge_bootmem_page *m = NULL; void *map; /* * Noting to do if bootmem pages were not allocated * early in boot, or if HVO wasn't enabled in the * first place. 
*/ if (!hugetlb_bootmem_allocated()) return; if (!READ_ONCE(vmemmap_optimize_enabled)) return; section_size = (1UL << PA_SECTION_SHIFT); list_for_each_entry(m, &huge_boot_pages[nid], list) { if (!vmemmap_should_optimize_bootmem_page(m)) continue; nr_pages = pages_per_huge_page(m->hstate); psize = nr_pages << PAGE_SHIFT; paddr = virt_to_phys(m); pfn = PHYS_PFN(paddr); map = pfn_to_page(pfn); start = (unsigned long)map; end = start + nr_pages * sizeof(struct page); if (vmemmap_populate_hvo(start, end, nid, HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) continue; memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE); pnum = pfn_to_section_nr(pfn); ns = psize / section_size; for (i = 0; i < ns; i++) { sparse_init_early_section(nid, map, pnum, SECTION_IS_VMEMMAP_PREINIT); map += section_map_size(); pnum++; } m->flags |= HUGE_BOOTMEM_HVO; } } void __init hugetlb_vmemmap_init_late(int nid) { struct huge_bootmem_page *m, *tm; unsigned long phys, nr_pages, start, end; unsigned long pfn, nr_mmap; struct hstate *h; void *map; if (!hugetlb_bootmem_allocated()) return; if (!READ_ONCE(vmemmap_optimize_enabled)) return; list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) { if (!(m->flags & HUGE_BOOTMEM_HVO)) continue; phys = virt_to_phys(m); h = m->hstate; pfn = PHYS_PFN(phys); nr_pages = pages_per_huge_page(h); if (!hugetlb_bootmem_page_zones_valid(nid, m)) { /* * Oops, the hugetlb page spans multiple zones. * Remove it from the list, and undo HVO. */ list_del(&m->list); map = pfn_to_page(pfn); start = (unsigned long)map; end = start + nr_pages * sizeof(struct page); vmemmap_undo_hvo(start, end, nid, HUGETLB_VMEMMAP_RESERVE_SIZE); nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE; memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE)); memblock_phys_free(phys, huge_page_size(h)); continue; } else m->flags |= HUGE_BOOTMEM_ZONES_VALID; } } #endif static const struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", .data = &vmemmap_optimize_enabled, .maxlen = sizeof(vmemmap_optimize_enabled), .mode = 0644, .proc_handler = proc_dobool, }, }; static int __init hugetlb_vmemmap_init(void) { const struct hstate *h; /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES); for_each_hstate(h) { if (hugetlb_vmemmap_optimizable(h)) { register_sysctl_init("vm", hugetlb_vmemmap_sysctls); break; } } return 0; } late_initcall(hugetlb_vmemmap_init);
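The HVO control points are visible above: the hugetlb_free_vmemmap= early parameter and the vm.hugetlb_optimize_vmemmap sysctl both toggle vmemmap_optimize_enabled. A back-of-the-envelope sizing sketch follows; it is a standalone userspace illustration, not kernel code, and it assumes 4 KiB base pages, a 64-byte struct page and a one-page reserve (HUGETLB_VMEMMAP_RESERVE_SIZE), as on x86_64. Under those assumptions HVO reclaims 7 of the 8 vmemmap pages behind a 2 MiB HugeTLB page and 4095 of 4096 behind a 1 GiB one.

/*
 * Standalone sketch (userspace C): estimate how much vmemmap HVO can
 * reclaim per HugeTLB page, assuming 4 KiB base pages, a 64-byte
 * struct page and a one-page reserve kept per huge page.
 */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define STRUCT_PAGE_SZ	64UL
#define RESERVE_SIZE	PAGE_SIZE	/* vmemmap kept per huge page */

static void hvo_savings(const char *name, unsigned long huge_size)
{
	unsigned long nr_base = huge_size / PAGE_SIZE;
	unsigned long vmemmap = nr_base * STRUCT_PAGE_SZ;
	unsigned long freed = vmemmap - RESERVE_SIZE;

	printf("%s: vmemmap %lu KiB, HVO frees %lu KiB (%lu pages)\n",
	       name, vmemmap / 1024, freed / 1024, freed / PAGE_SIZE);
}

int main(void)
{
	hvo_savings("2 MiB huge page", 2UL << 20);	/* 7 pages freed */
	hvo_savings("1 GiB huge page", 1UL << 30);	/* 4095 pages freed */
	return 0;
}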
249 116 119 119 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_PAGE_OWNER_H #define __LINUX_PAGE_OWNER_H #include <linux/jump_label.h> #ifdef CONFIG_PAGE_OWNER extern struct static_key_false page_owner_inited; extern struct page_ext_operations page_owner_ops; extern void __reset_page_owner(struct page *page, unsigned short order); extern void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask); extern void __split_page_owner(struct page *page, int old_order, int new_order); extern void __folio_copy_owner(struct folio *newfolio, struct folio *old); extern void __folio_set_owner_migrate_reason(struct folio *folio, int reason); extern void __dump_page_owner(const struct page *page); extern void pagetypeinfo_showmixedcount_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone); static inline void reset_page_owner(struct page *page, unsigned short order) { if (static_branch_unlikely(&page_owner_inited)) __reset_page_owner(page, order); } static inline void set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask) { if (static_branch_unlikely(&page_owner_inited)) __set_page_owner(page, order, gfp_mask); } static inline void split_page_owner(struct page *page, int old_order, int new_order) { if (static_branch_unlikely(&page_owner_inited)) __split_page_owner(page, old_order, new_order); } static inline void folio_copy_owner(struct folio *newfolio, struct folio *old) { if (static_branch_unlikely(&page_owner_inited)) __folio_copy_owner(newfolio, old); } static inline void folio_set_owner_migrate_reason(struct folio *folio, int reason) { if (static_branch_unlikely(&page_owner_inited)) __folio_set_owner_migrate_reason(folio, reason); } static inline void dump_page_owner(const struct page *page) { if (static_branch_unlikely(&page_owner_inited)) __dump_page_owner(page); } #else static inline void reset_page_owner(struct page *page, unsigned short order) { } static inline void set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask) { } static inline void split_page_owner(struct page *page, int old_order, int new_order) { } static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio) { } static inline void folio_set_owner_migrate_reason(struct folio *folio, int reason) { } static inline void dump_page_owner(const struct page *page) { } #endif /* CONFIG_PAGE_OWNER */ #endif /* __LINUX_PAGE_OWNER_H */
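The inline wrappers above follow a common pattern: the slow path (__set_page_owner() and friends) is only reached once the page_owner_inited static key has been flipped, which with CONFIG_PAGE_OWNER happens during early init when the page_owner=on boot parameter is given; otherwise each call compiles down to a patched-out branch. Below is a minimal userspace illustration of that guard, with a plain bool standing in for the static key and a simplified callback (the real __set_page_owner() also takes a gfp_mask).

#include <stdbool.h>
#include <stdio.h>

static bool page_owner_inited;		/* stands in for the static key */

/* simplified stand-in for __set_page_owner() */
static void record_page_owner(void *page, unsigned short order)
{
	printf("recording owner of page %p, order %u\n", page, (unsigned int)order);
}

static inline void set_page_owner(void *page, unsigned short order)
{
	if (page_owner_inited)		/* static_branch_unlikely() in the kernel */
		record_page_owner(page, order);
}

int main(void)
{
	int dummy;

	set_page_owner(&dummy, 0);	/* no-op: feature not enabled */
	page_owner_inited = true;	/* done during early boot in the kernel */
	set_page_owner(&dummy, 0);	/* slow path now runs */
	return 0;
}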
14 14 14 14 14 14 14 14 14 14 727 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 // SPDX-License-Identifier: GPL-2.0 /* * USB-ACPI glue code * * Copyright 2012 Red Hat <mjg@redhat.com> */ #include <linux/module.h> #include <linux/usb.h> #include <linux/device.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/acpi.h> #include <linux/pci.h> #include <linux/usb/hcd.h> #include "hub.h" /** * usb_acpi_power_manageable - check whether usb port has * acpi power resource. * @hdev: USB device belonging to the usb hub * @index: port index based zero * * Return true if the port has acpi power resource and false if no. */ bool usb_acpi_power_manageable(struct usb_device *hdev, int index) { acpi_handle port_handle; int port1 = index + 1; port_handle = usb_get_hub_port_acpi_handle(hdev, port1); if (port_handle) return acpi_bus_power_manageable(port_handle); else return false; } EXPORT_SYMBOL_GPL(usb_acpi_power_manageable); #define UUID_USB_CONTROLLER_DSM "ce2ee385-00e6-48cb-9f05-2edb927c4899" #define USB_DSM_DISABLE_U1_U2_FOR_PORT 5 /** * usb_acpi_port_lpm_incapable - check if lpm should be disabled for a port. * @hdev: USB device belonging to the usb hub * @index: zero based port index * * Some USB3 ports may not support USB3 link power management U1/U2 states * due to different retimer setup. ACPI provides _DSM method which returns 0x01 * if U1 and U2 states should be disabled. 
Evaluate _DSM with: * Arg0: UUID = ce2ee385-00e6-48cb-9f05-2edb927c4899 * Arg1: Revision ID = 0 * Arg2: Function Index = 5 * Arg3: (empty) * * Return 1 if USB3 port is LPM incapable, negative on error, otherwise 0 */ int usb_acpi_port_lpm_incapable(struct usb_device *hdev, int index) { union acpi_object *obj; acpi_handle port_handle; int port1 = index + 1; guid_t guid; int ret; ret = guid_parse(UUID_USB_CONTROLLER_DSM, &guid); if (ret) return ret; port_handle = usb_get_hub_port_acpi_handle(hdev, port1); if (!port_handle) { dev_dbg(&hdev->dev, "port-%d no acpi handle\n", port1); return -ENODEV; } if (!acpi_check_dsm(port_handle, &guid, 0, BIT(USB_DSM_DISABLE_U1_U2_FOR_PORT))) { dev_dbg(&hdev->dev, "port-%d no _DSM function %d\n", port1, USB_DSM_DISABLE_U1_U2_FOR_PORT); return -ENODEV; } obj = acpi_evaluate_dsm_typed(port_handle, &guid, 0, USB_DSM_DISABLE_U1_U2_FOR_PORT, NULL, ACPI_TYPE_INTEGER); if (!obj) { dev_dbg(&hdev->dev, "evaluate port-%d _DSM failed\n", port1); return -EINVAL; } if (obj->integer.value == 0x01) ret = 1; ACPI_FREE(obj); return ret; } EXPORT_SYMBOL_GPL(usb_acpi_port_lpm_incapable); /** * usb_acpi_set_power_state - control usb port's power via acpi power * resource * @hdev: USB device belonging to the usb hub * @index: port index based zero * @enable: power state expected to be set * * Notice to use usb_acpi_power_manageable() to check whether the usb port * has acpi power resource before invoking this function. * * Returns 0 on success, else negative errno. */ int usb_acpi_set_power_state(struct usb_device *hdev, int index, bool enable) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_port *port_dev; acpi_handle port_handle; unsigned char state; int port1 = index + 1; int error = -EINVAL; if (!hub) return -ENODEV; port_dev = hub->ports[port1 - 1]; port_handle = (acpi_handle) usb_get_hub_port_acpi_handle(hdev, port1); if (!port_handle) return error; if (enable) state = ACPI_STATE_D0; else state = ACPI_STATE_D3_COLD; error = acpi_bus_set_power(port_handle, state); if (!error) dev_dbg(&port_dev->dev, "acpi: power was set to %d\n", enable); else dev_dbg(&port_dev->dev, "acpi: power failed to be set\n"); return error; } EXPORT_SYMBOL_GPL(usb_acpi_set_power_state); /** * usb_acpi_add_usb4_devlink - add device link to USB4 Host Interface for tunneled USB3 devices * * @udev: Tunneled USB3 device connected to a roothub. * * Adds a device link between a tunneled USB3 device and the USB4 Host Interface * device to ensure correct runtime PM suspend and resume order. This function * should only be called for tunneled USB3 devices. * The USB4 Host Interface this tunneled device depends on is found from the roothub * port ACPI device specific data _DSD entry. 
* * Return: negative error code on failure, 0 otherwise */ static int usb_acpi_add_usb4_devlink(struct usb_device *udev) { struct device_link *link; struct usb_port *port_dev; struct usb_hub *hub; if (!udev->parent || udev->parent->parent) return 0; hub = usb_hub_to_struct_hub(udev->parent); if (!hub) return 0; port_dev = hub->ports[udev->portnum - 1]; struct fwnode_handle *nhi_fwnode __free(fwnode_handle) = fwnode_find_reference(dev_fwnode(&port_dev->dev), "usb4-host-interface", 0); if (IS_ERR(nhi_fwnode) || !nhi_fwnode->dev) return 0; link = device_link_add(&port_dev->child->dev, nhi_fwnode->dev, DL_FLAG_STATELESS | DL_FLAG_RPM_ACTIVE | DL_FLAG_PM_RUNTIME); if (!link) { dev_err(&port_dev->dev, "Failed to created device link from %s to %s\n", dev_name(&port_dev->child->dev), dev_name(nhi_fwnode->dev)); return -EINVAL; } dev_dbg(&port_dev->dev, "Created device link from %s to %s\n", dev_name(&port_dev->child->dev), dev_name(nhi_fwnode->dev)); udev->usb4_link = link; return 0; } /* * Private to usb-acpi, all the core needs to know is that * port_dev->location is non-zero when it has been set by the firmware. */ #define USB_ACPI_LOCATION_VALID (1 << 31) static void usb_acpi_get_connect_type(struct usb_port *port_dev, acpi_handle *handle) { enum usb_port_connect_type connect_type = USB_PORT_CONNECT_TYPE_UNKNOWN; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *upc = NULL; struct acpi_pld_info *pld = NULL; acpi_status status; /* * According to 9.14 in ACPI Spec 6.2. _PLD indicates whether usb port * is user visible and _UPC indicates whether it is connectable. If * the port was visible and connectable, it could be freely connected * and disconnected with USB devices. If no visible and connectable, * a usb device is directly hard-wired to the port. If no visible and * no connectable, the port would be not used. */ if (acpi_get_physical_device_location(handle, &pld) && pld) port_dev->location = USB_ACPI_LOCATION_VALID | pld->group_token << 8 | pld->group_position; status = acpi_evaluate_object(handle, "_UPC", NULL, &buffer); if (ACPI_FAILURE(status)) goto out; upc = buffer.pointer; if (!upc || (upc->type != ACPI_TYPE_PACKAGE) || upc->package.count != 4) goto out; /* UPC states port is connectable */ if (upc->package.elements[0].integer.value) if (!pld) ; /* keep connect_type as unknown */ else if (pld->user_visible) connect_type = USB_PORT_CONNECT_TYPE_HOT_PLUG; else connect_type = USB_PORT_CONNECT_TYPE_HARD_WIRED; else connect_type = USB_PORT_NOT_USED; out: port_dev->connect_type = connect_type; kfree(upc); ACPI_FREE(pld); } static struct acpi_device * usb_acpi_get_companion_for_port(struct usb_port *port_dev) { struct usb_device *udev; struct acpi_device *adev; acpi_handle *parent_handle; int port1; /* Get the struct usb_device point of port's hub */ udev = to_usb_device(port_dev->dev.parent->parent); /* * The root hub ports' parent is the root hub. The non-root-hub * ports' parent is the parent hub port which the hub is * connected to. 
*/ if (!udev->parent) { adev = ACPI_COMPANION(&udev->dev); port1 = usb_hcd_find_raw_port_number(bus_to_hcd(udev->bus), port_dev->portnum); } else { parent_handle = usb_get_hub_port_acpi_handle(udev->parent, udev->portnum); if (!parent_handle) return NULL; adev = acpi_fetch_acpi_dev(parent_handle); port1 = port_dev->portnum; } return acpi_find_child_by_adr(adev, port1); } static struct acpi_device * usb_acpi_find_companion_for_port(struct usb_port *port_dev) { struct acpi_device *adev; adev = usb_acpi_get_companion_for_port(port_dev); if (!adev) return NULL; usb_acpi_get_connect_type(port_dev, adev->handle); return adev; } static struct acpi_device * usb_acpi_find_companion_for_device(struct usb_device *udev) { struct acpi_device *adev; struct usb_port *port_dev; struct usb_hub *hub; if (!udev->parent) { /* * root hub is only child (_ADR=0) under its parent, the HC. * sysdev pointer is the HC as seen from firmware. */ adev = ACPI_COMPANION(udev->bus->sysdev); return acpi_find_child_device(adev, 0, false); } hub = usb_hub_to_struct_hub(udev->parent); if (!hub) return NULL; /* Tunneled USB3 devices depend on USB4 Host Interface, set device link to it */ if (udev->speed >= USB_SPEED_SUPER && udev->tunnel_mode != USB_LINK_NATIVE) usb_acpi_add_usb4_devlink(udev); /* * This is an embedded USB device connected to a port and such * devices share port's ACPI companion. */ port_dev = hub->ports[udev->portnum - 1]; return usb_acpi_get_companion_for_port(port_dev); } static struct acpi_device *usb_acpi_find_companion(struct device *dev) { /* * The USB hierarchy like following: * * Device (EHC1) * Device (HUBN) * Device (PR01) * Device (PR11) * Device (PR12) * Device (FN12) * Device (FN13) * Device (PR13) * ... * where HUBN is root hub, and PRNN are USB ports and devices * connected to them, and FNNN are individualk functions for * connected composite USB devices. PRNN and FNNN may contain * _CRS and other methods describing sideband resources for * the connected device. * * On the kernel side both root hub and embedded USB devices are * represented as instances of usb_device structure, and ports * are represented as usb_port structures, so the whole process * is split into 2 parts: finding companions for devices and * finding companions for ports. * * Note that we do not handle individual functions of composite * devices yet, for that we would need to assign companions to * devices corresponding to USB interfaces. */ if (is_usb_device(dev)) return usb_acpi_find_companion_for_device(to_usb_device(dev)); else if (is_usb_port(dev)) return usb_acpi_find_companion_for_port(to_usb_port(dev)); return NULL; } static bool usb_acpi_bus_match(struct device *dev) { return is_usb_device(dev) || is_usb_port(dev); } static struct acpi_bus_type usb_acpi_bus = { .name = "USB", .match = usb_acpi_bus_match, .find_companion = usb_acpi_find_companion, }; int usb_acpi_register(void) { return register_acpi_bus_type(&usb_acpi_bus); } void usb_acpi_unregister(void) { unregister_acpi_bus_type(&usb_acpi_bus); }
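A short usage sketch for the port power helpers exported above. This is illustrative kernel-context code, not compilable on its own: example_port_power() is a hypothetical wrapper, and it simply follows the kernel-doc requirement that usb_acpi_power_manageable() be checked before usb_acpi_set_power_state() is called.

/*
 * Hypothetical caller of the exported helpers above: gate the ACPI
 * power-state change on the port actually having an ACPI power
 * resource, as the kernel-doc for usb_acpi_set_power_state() requires.
 */
#include <linux/usb.h>
#include <linux/errno.h>

static int example_port_power(struct usb_device *hdev, int port1, bool on)
{
	int index = port1 - 1;	/* both helpers take a zero-based port index */

	if (!usb_acpi_power_manageable(hdev, index))
		return -ENODEV;	/* port has no ACPI power resource */

	return usb_acpi_set_power_state(hdev, index, on);
}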
860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * * This file contains the interface functions for the various time related * system calls: time, stime, gettimeofday, settimeofday, adjtime * * Modification history: * * 1993-09-02 Philip Gladstone * Created file with time related functions from sched/core.c and adjtimex() * 1993-10-08 Torsten Duwe * adjtime interface update and CMOS clock write code * 1995-08-13 Torsten Duwe * kernel PLL updated to 1994-12-13 specs (rfc-1589) * 1999-01-16 Ulrich Windl * Introduced error checking for many cases in adjtimex(). * Updated NTP code according to technical memorandum Jan '96 * "A Kernel Model for Precision Timekeeping" by Dave Mills * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) * (Even though the technical memorandum forbids it) * 2004-07-14 Christoph Lameter * Added getnstimeofday to allow the posix timer functions to return * with nanosecond accuracy */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/timex.h> #include <linux/capability.h> #include <linux/timekeeper_internal.h> #include <linux/errno.h> #include <linux/syscalls.h> #include <linux/security.h> #include <linux/fs.h> #include <linux/math64.h> #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/compat.h> #include <asm/unistd.h> #include <generated/timeconst.h> #include "timekeeping.h" /* * The timezone where the local system is located. Used as a default by some * programs who obtain this value by using gettimeofday. */ struct timezone sys_tz; EXPORT_SYMBOL(sys_tz); #ifdef __ARCH_WANT_SYS_TIME /* * sys_time() can be implemented in user-level using * sys_gettimeofday(). Is this for backwards compatibility? If so, * why not move it into the appropriate arch directory (for those * architectures that need it). */ SYSCALL_DEFINE1(time, __kernel_old_time_t __user *, tloc) { __kernel_old_time_t i = (__kernel_old_time_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) return -EFAULT; } force_successful_syscall_return(); return i; } /* * sys_stime() can be implemented in user-level using * sys_settimeofday(). Is this for backwards compatibility? If so, * why not move it into the appropriate arch directory (for those * architectures that need it). */ SYSCALL_DEFINE1(stime, __kernel_old_time_t __user *, tptr) { struct timespec64 tv; int err; if (get_user(tv.tv_sec, tptr)) return -EFAULT; tv.tv_nsec = 0; err = security_settime64(&tv, NULL); if (err) return err; do_settimeofday64(&tv); return 0; } #endif /* __ARCH_WANT_SYS_TIME */ #ifdef CONFIG_COMPAT_32BIT_TIME #ifdef __ARCH_WANT_SYS_TIME32 /* old_time32_t is a 32 bit "long" and needs to get converted. 
*/ SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc) { old_time32_t i; i = (old_time32_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) return -EFAULT; } force_successful_syscall_return(); return i; } SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr) { struct timespec64 tv; int err; if (get_user(tv.tv_sec, tptr)) return -EFAULT; tv.tv_nsec = 0; err = security_settime64(&tv, NULL); if (err) return err; do_settimeofday64(&tv); return 0; } #endif /* __ARCH_WANT_SYS_TIME32 */ #endif SYSCALL_DEFINE2(gettimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { if (likely(tv != NULL)) { struct timespec64 ts; ktime_get_real_ts64(&ts); if (put_user(ts.tv_sec, &tv->tv_sec) || put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (unlikely(tz != NULL)) { if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) return -EFAULT; } return 0; } /* * In case for some reason the CMOS clock has not already been running * in UTC, but in some local time: The first time we set the timezone, * we will warp the clock so that it is ticking UTC time instead of * local time. Presumably, if someone is setting the timezone then we * are running in an environment where the programs understand about * timezones. This should be done at boot time in the /etc/rc script, * as soon as possible, so that the clock can be set right. Otherwise, * various programs will get confused when the clock gets warped. */ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz) { static int firsttime = 1; int error = 0; if (tv && !timespec64_valid_settod(tv)) return -EINVAL; error = security_settime64(tv, tz); if (error) return error; if (tz) { /* Verify we're within the +-15 hrs range */ if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60) return -EINVAL; sys_tz = *tz; update_vsyscall_tz(); if (firsttime) { firsttime = 0; if (!tv) timekeeping_warp_clock(); } } if (tv) return do_settimeofday64(tv); return 0; } SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; struct timezone new_tz; if (tv) { if (get_user(new_ts.tv_sec, &tv->tv_sec) || get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) return -EFAULT; } return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(gettimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { if (tv) { struct timespec64 ts; ktime_get_real_ts64(&ts); if (put_user(ts.tv_sec, &tv->tv_sec) || put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (tz) { if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) return -EFAULT; } return 0; } COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; struct timezone new_tz; if (tv) { if (get_user(new_ts.tv_sec, &tv->tv_sec) || get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) return -EFAULT; } return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? 
&new_tz : NULL); } #endif #ifdef CONFIG_64BIT SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p) { struct __kernel_timex txc; /* Local copy of parameter */ int ret; /* Copy the user data space into the kernel copy * structure. But bear in mind that the structures * may change */ if (copy_from_user(&txc, txc_p, sizeof(struct __kernel_timex))) return -EFAULT; ret = do_adjtimex(&txc); return copy_to_user(txc_p, &txc, sizeof(struct __kernel_timex)) ? -EFAULT : ret; } #endif #ifdef CONFIG_COMPAT_32BIT_TIME int get_old_timex32(struct __kernel_timex *txc, const struct old_timex32 __user *utp) { struct old_timex32 tx32; memset(txc, 0, sizeof(struct __kernel_timex)); if (copy_from_user(&tx32, utp, sizeof(struct old_timex32))) return -EFAULT; txc->modes = tx32.modes; txc->offset = tx32.offset; txc->freq = tx32.freq; txc->maxerror = tx32.maxerror; txc->esterror = tx32.esterror; txc->status = tx32.status; txc->constant = tx32.constant; txc->precision = tx32.precision; txc->tolerance = tx32.tolerance; txc->time.tv_sec = tx32.time.tv_sec; txc->time.tv_usec = tx32.time.tv_usec; txc->tick = tx32.tick; txc->ppsfreq = tx32.ppsfreq; txc->jitter = tx32.jitter; txc->shift = tx32.shift; txc->stabil = tx32.stabil; txc->jitcnt = tx32.jitcnt; txc->calcnt = tx32.calcnt; txc->errcnt = tx32.errcnt; txc->stbcnt = tx32.stbcnt; return 0; } int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex *txc) { struct old_timex32 tx32; memset(&tx32, 0, sizeof(struct old_timex32)); tx32.modes = txc->modes; tx32.offset = txc->offset; tx32.freq = txc->freq; tx32.maxerror = txc->maxerror; tx32.esterror = txc->esterror; tx32.status = txc->status; tx32.constant = txc->constant; tx32.precision = txc->precision; tx32.tolerance = txc->tolerance; tx32.time.tv_sec = txc->time.tv_sec; tx32.time.tv_usec = txc->time.tv_usec; tx32.tick = txc->tick; tx32.ppsfreq = txc->ppsfreq; tx32.jitter = txc->jitter; tx32.shift = txc->shift; tx32.stabil = txc->stabil; tx32.jitcnt = txc->jitcnt; tx32.calcnt = txc->calcnt; tx32.errcnt = txc->errcnt; tx32.stbcnt = txc->stbcnt; tx32.tai = txc->tai; if (copy_to_user(utp, &tx32, sizeof(struct old_timex32))) return -EFAULT; return 0; } SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp) { struct __kernel_timex txc; int err, ret; err = get_old_timex32(&txc, utp); if (err) return err; ret = do_adjtimex(&txc); err = put_old_timex32(utp, &txc); if (err) return err; return ret; } #endif /** * jiffies_to_msecs - Convert jiffies to milliseconds * @j: jiffies value * * Avoid unnecessary multiplications/divisions in the * two most common HZ cases. * * Return: milliseconds value */ unsigned int jiffies_to_msecs(const unsigned long j) { #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) return (MSEC_PER_SEC / HZ) * j; #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); #else # if BITS_PER_LONG == 32 return (HZ_TO_MSEC_MUL32 * j + (1ULL << HZ_TO_MSEC_SHR32) - 1) >> HZ_TO_MSEC_SHR32; # else return DIV_ROUND_UP(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); # endif #endif } EXPORT_SYMBOL(jiffies_to_msecs); /** * jiffies_to_usecs - Convert jiffies to microseconds * @j: jiffies value * * Return: microseconds value */ unsigned int jiffies_to_usecs(const unsigned long j) { /* * Hz usually doesn't go much further MSEC_PER_SEC. * jiffies_to_usecs() and usecs_to_jiffies() depend on that. 
*/ BUILD_BUG_ON(HZ > USEC_PER_SEC); #if !(USEC_PER_SEC % HZ) return (USEC_PER_SEC / HZ) * j; #else # if BITS_PER_LONG == 32 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; # else return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; # endif #endif } EXPORT_SYMBOL(jiffies_to_usecs); /** * mktime64 - Converts date to seconds. * @year0: year to convert * @mon0: month to convert * @day: day to convert * @hour: hour to convert * @min: minute to convert * @sec: second to convert * * Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. * * [For the Julian calendar (which was used in Russia before 1917, * Britain & colonies before 1752, anywhere else before 1582, * and is still in use by some communities) leave out the * -year/100+year/400 terms, and add 10.] * * This algorithm was first published by Gauss (I think). * * A leap second can be indicated by calling this function with sec as * 60 (allowable under ISO 8601). The leap second is treated the same * as the following second since they don't exist in UNIX time. * * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight * tomorrow - (allowable under ISO 8601) is supported. * * Return: seconds since the epoch time for the given input date */ time64_t mktime64(const unsigned int year0, const unsigned int mon0, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec) { unsigned int mon = mon0, year = year0; /* 1..12 -> 11,12,1..10 */ if (0 >= (int) (mon -= 2)) { mon += 12; /* Puts Feb last since it has leap day */ year -= 1; } return ((((time64_t) (year/4 - year/100 + year/400 + 367*mon/12 + day) + year*365 - 719499 )*24 + hour /* now have hours - midnight tomorrow handled here */ )*60 + min /* now have minutes */ )*60 + sec; /* finally seconds */ } EXPORT_SYMBOL(mktime64); struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec) { struct timespec64 ts = ns_to_timespec64(nsec); struct __kernel_old_timeval tv; tv.tv_sec = ts.tv_sec; tv.tv_usec = (suseconds_t)ts.tv_nsec / 1000; return tv; } EXPORT_SYMBOL(ns_to_kernel_old_timeval); /** * set_normalized_timespec64 - set timespec sec and nsec parts and normalize * * @ts: pointer to timespec variable to be set * @sec: seconds to set * @nsec: nanoseconds to set * * Set seconds and nanoseconds field of a timespec variable and * normalize to the timespec storage format * * Note: The tv_nsec part is always in the range of 0 <= tv_nsec < NSEC_PER_SEC. * For negative values only the tv_sec field is negative ! */ void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec) { while (nsec >= NSEC_PER_SEC) { /* * The following asm() prevents the compiler from * optimising this loop into a modulo operation. See * also __iter_div_u64_rem() in include/linux/time.h */ asm("" : "+rm"(nsec)); nsec -= NSEC_PER_SEC; ++sec; } while (nsec < 0) { asm("" : "+rm"(nsec)); nsec += NSEC_PER_SEC; --sec; } ts->tv_sec = sec; ts->tv_nsec = nsec; } EXPORT_SYMBOL(set_normalized_timespec64); /** * ns_to_timespec64 - Convert nanoseconds to timespec64 * @nsec: the nanoseconds value to be converted * * Return: the timespec64 representation of the nsec parameter. 
*/ struct timespec64 ns_to_timespec64(s64 nsec) { struct timespec64 ts = { 0, 0 }; s32 rem; if (likely(nsec > 0)) { ts.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); ts.tv_nsec = rem; } else if (nsec < 0) { /* * With negative times, tv_sec points to the earlier * second, and tv_nsec counts the nanoseconds since * then, so tv_nsec is always a positive number. */ ts.tv_sec = -div_u64_rem(-nsec - 1, NSEC_PER_SEC, &rem) - 1; ts.tv_nsec = NSEC_PER_SEC - rem - 1; } return ts; } EXPORT_SYMBOL(ns_to_timespec64); /** * __msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds * * conversion is done as follows: * * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) * * - 'too large' values [that would result in larger than * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows. * for the details see _msecs_to_jiffies() * * msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code, __msecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. * The _msecs_to_jiffies helpers are the HZ dependent conversion * routines found in include/linux/jiffies.h * * Return: jiffies value */ unsigned long __msecs_to_jiffies(const unsigned int m) { /* * Negative value, means infinite timeout: */ if ((int)m < 0) return MAX_JIFFY_OFFSET; return _msecs_to_jiffies(m); } EXPORT_SYMBOL(__msecs_to_jiffies); /** * __usecs_to_jiffies: - convert microseconds to jiffies * @u: time in milliseconds * * Return: jiffies value */ unsigned long __usecs_to_jiffies(const unsigned int u) { if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return _usecs_to_jiffies(u); } EXPORT_SYMBOL(__usecs_to_jiffies); /** * timespec64_to_jiffies - convert a timespec64 value to jiffies * @value: pointer to &struct timespec64 * * The TICK_NSEC - 1 rounds up the value to the next resolution. Note * that a remainder subtract here would not do the right thing as the * resolution values don't fall on second boundaries. I.e. the line: * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. * Note that due to the small error in the multiplier here, this * rounding is incorrect for sufficiently large values of tv_nsec, but * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're * OK. * * Rather, we just shift the bits off the right. * * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec * value to a scaled second value. * * Return: jiffies value */ unsigned long timespec64_to_jiffies(const struct timespec64 *value) { u64 sec = value->tv_sec; long nsec = value->tv_nsec + TICK_NSEC - 1; if (sec >= MAX_SEC_IN_JIFFIES){ sec = MAX_SEC_IN_JIFFIES; nsec = 0; } return ((sec * SEC_CONVERSION) + (((u64)nsec * NSEC_CONVERSION) >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } EXPORT_SYMBOL(timespec64_to_jiffies); /** * jiffies_to_timespec64 - convert jiffies value to &struct timespec64 * @jiffies: jiffies value * @value: pointer to &struct timespec64 */ void jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) { /* * Convert jiffies to nanoseconds and separate with * one divide. 
*/ u32 rem; value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, NSEC_PER_SEC, &rem); value->tv_nsec = rem; } EXPORT_SYMBOL(jiffies_to_timespec64); /* * Convert jiffies/jiffies_64 to clock_t and back. */ /** * jiffies_to_clock_t - Convert jiffies to clock_t * @x: jiffies value * * Return: jiffies converted to clock_t (CLOCKS_PER_SEC) */ clock_t jiffies_to_clock_t(unsigned long x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ return x * (USER_HZ / HZ); # else return x / (HZ / USER_HZ); # endif #else return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); #endif } EXPORT_SYMBOL(jiffies_to_clock_t); /** * clock_t_to_jiffies - Convert clock_t to jiffies * @x: clock_t value * * Return: clock_t value converted to jiffies */ unsigned long clock_t_to_jiffies(unsigned long x) { #if (HZ % USER_HZ)==0 if (x >= ~0UL / (HZ / USER_HZ)) return ~0UL; return x * (HZ / USER_HZ); #else /* Don't worry about loss of precision here .. */ if (x >= ~0UL / HZ * USER_HZ) return ~0UL; /* .. but do try to contain it here */ return div_u64((u64)x * HZ, USER_HZ); #endif } EXPORT_SYMBOL(clock_t_to_jiffies); /** * jiffies_64_to_clock_t - Convert jiffies_64 to clock_t * @x: jiffies_64 value * * Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC) */ u64 jiffies_64_to_clock_t(u64 x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ x = div_u64(x * USER_HZ, HZ); # elif HZ > USER_HZ x = div_u64(x, HZ / USER_HZ); # else /* Nothing to do */ # endif #else /* * There are better ways that don't overflow early, * but even this doesn't overflow in hundreds of years * in 64 bits, so.. */ x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); #endif return x; } EXPORT_SYMBOL(jiffies_64_to_clock_t); /** * nsec_to_clock_t - Convert nsec value to clock_t * @x: nsec value * * Return: nsec value converted to 64-bit "clock_t" (CLOCKS_PER_SEC) */ u64 nsec_to_clock_t(u64 x) { #if (NSEC_PER_SEC % USER_HZ) == 0 return div_u64(x, NSEC_PER_SEC / USER_HZ); #elif (USER_HZ % 512) == 0 return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); #else /* * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, * overflow after 64.99 years. * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... */ return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); #endif } /** * jiffies64_to_nsecs - Convert jiffies64 to nanoseconds * @j: jiffies64 value * * Return: nanoseconds value */ u64 jiffies64_to_nsecs(u64 j) { #if !(NSEC_PER_SEC % HZ) return (NSEC_PER_SEC / HZ) * j; # else return div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN); #endif } EXPORT_SYMBOL(jiffies64_to_nsecs); /** * jiffies64_to_msecs - Convert jiffies64 to milliseconds * @j: jiffies64 value * * Return: milliseconds value */ u64 jiffies64_to_msecs(const u64 j) { #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) return (MSEC_PER_SEC / HZ) * j; #else return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); #endif } EXPORT_SYMBOL(jiffies64_to_msecs); /** * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * * @n: nsecs in u64 * * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. * And this doesn't return MAX_JIFFY_OFFSET since this function is designed * for scheduler, not for use in device drivers to calculate timeout value. 
* * note: * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years * * Return: nsecs converted to jiffies64 value */ u64 nsecs_to_jiffies64(u64 n) { #if (NSEC_PER_SEC % HZ) == 0 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ return div_u64(n, NSEC_PER_SEC / HZ); #elif (HZ % 512) == 0 /* overflow after 292 years if HZ = 1024 */ return div_u64(n * HZ / 512, NSEC_PER_SEC / 512); #else /* * Generic case - optimized for cases where HZ is a multiple of 3. * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc. */ return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); #endif } EXPORT_SYMBOL(nsecs_to_jiffies64); /** * nsecs_to_jiffies - Convert nsecs in u64 to jiffies * * @n: nsecs in u64 * * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. * And this doesn't return MAX_JIFFY_OFFSET since this function is designed * for scheduler, not for use in device drivers to calculate timeout value. * * note: * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years * * Return: nsecs converted to jiffies value */ unsigned long nsecs_to_jiffies(u64 n) { return (unsigned long)nsecs_to_jiffies64(n); } EXPORT_SYMBOL_GPL(nsecs_to_jiffies); /** * timespec64_add_safe - Add two timespec64 values and do a safety check * for overflow. * @lhs: first (left) timespec64 to add * @rhs: second (right) timespec64 to add * * It's assumed that both values are valid (>= 0). * And, each timespec64 is in normalized form. * * Return: sum of @lhs + @rhs */ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs) { struct timespec64 res; set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { res.tv_sec = TIME64_MAX; res.tv_nsec = 0; } return res; } EXPORT_SYMBOL_GPL(timespec64_add_safe); /** * get_timespec64 - get user's time value into kernel space * @ts: destination &struct timespec64 * @uts: user's time value as &struct __kernel_timespec * * Handles compat or 32-bit modes. * * Return: 0 on success or negative errno on error */ int get_timespec64(struct timespec64 *ts, const struct __kernel_timespec __user *uts) { struct __kernel_timespec kts; int ret; ret = copy_from_user(&kts, uts, sizeof(kts)); if (ret) return -EFAULT; ts->tv_sec = kts.tv_sec; /* Zero out the padding in compat mode */ if (in_compat_syscall()) kts.tv_nsec &= 0xFFFFFFFFUL; /* In 32-bit mode, this drops the padding */ ts->tv_nsec = kts.tv_nsec; return 0; } EXPORT_SYMBOL_GPL(get_timespec64); /** * put_timespec64 - convert timespec64 value to __kernel_timespec format and * copy the latter to userspace * @ts: input &struct timespec64 * @uts: user's &struct __kernel_timespec * * Return: 0 on success or negative errno on error */ int put_timespec64(const struct timespec64 *ts, struct __kernel_timespec __user *uts) { struct __kernel_timespec kts = { .tv_sec = ts->tv_sec, .tv_nsec = ts->tv_nsec }; return copy_to_user(uts, &kts, sizeof(kts)) ? 
-EFAULT : 0; } EXPORT_SYMBOL_GPL(put_timespec64); static int __get_old_timespec32(struct timespec64 *ts64, const struct old_timespec32 __user *cts) { struct old_timespec32 ts; int ret; ret = copy_from_user(&ts, cts, sizeof(ts)); if (ret) return -EFAULT; ts64->tv_sec = ts.tv_sec; ts64->tv_nsec = ts.tv_nsec; return 0; } static int __put_old_timespec32(const struct timespec64 *ts64, struct old_timespec32 __user *cts) { struct old_timespec32 ts = { .tv_sec = ts64->tv_sec, .tv_nsec = ts64->tv_nsec }; return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; } /** * get_old_timespec32 - get user's old-format time value into kernel space * @ts: destination &struct timespec64 * @uts: user's old-format time value (&struct old_timespec32) * * Handles X86_X32_ABI compatibility conversion. * * Return: 0 on success or negative errno on error */ int get_old_timespec32(struct timespec64 *ts, const void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; else return __get_old_timespec32(ts, uts); } EXPORT_SYMBOL_GPL(get_old_timespec32); /** * put_old_timespec32 - convert timespec64 value to &struct old_timespec32 and * copy the latter to userspace * @ts: input &struct timespec64 * @uts: user's &struct old_timespec32 * * Handles X86_X32_ABI compatibility conversion. * * Return: 0 on success or negative errno on error */ int put_old_timespec32(const struct timespec64 *ts, void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; else return __put_old_timespec32(ts, uts); } EXPORT_SYMBOL_GPL(put_old_timespec32); /** * get_itimerspec64 - get user's &struct __kernel_itimerspec into kernel space * @it: destination &struct itimerspec64 * @uit: user's &struct __kernel_itimerspec * * Return: 0 on success or negative errno on error */ int get_itimerspec64(struct itimerspec64 *it, const struct __kernel_itimerspec __user *uit) { int ret; ret = get_timespec64(&it->it_interval, &uit->it_interval); if (ret) return ret; ret = get_timespec64(&it->it_value, &uit->it_value); return ret; } EXPORT_SYMBOL_GPL(get_itimerspec64); /** * put_itimerspec64 - convert &struct itimerspec64 to __kernel_itimerspec format * and copy the latter to userspace * @it: input &struct itimerspec64 * @uit: user's &struct __kernel_itimerspec * * Return: 0 on success or negative errno on error */ int put_itimerspec64(const struct itimerspec64 *it, struct __kernel_itimerspec __user *uit) { int ret; ret = put_timespec64(&it->it_interval, &uit->it_interval); if (ret) return ret; ret = put_timespec64(&it->it_value, &uit->it_value); return ret; } EXPORT_SYMBOL_GPL(put_itimerspec64); /** * get_old_itimerspec32 - get user's &struct old_itimerspec32 into kernel space * @its: destination &struct itimerspec64 * @uits: user's &struct old_itimerspec32 * * Return: 0 on success or negative errno on error */ int get_old_itimerspec32(struct itimerspec64 *its, const struct old_itimerspec32 __user *uits) { if (__get_old_timespec32(&its->it_interval, &uits->it_interval) || __get_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(get_old_itimerspec32); /** * put_old_itimerspec32 - convert &struct itimerspec64 to &struct * old_itimerspec32 and copy the latter to userspace * @its: input &struct itimerspec64 * @uits: user's &struct old_itimerspec32 * * Return: 0 on success or negative errno on error */ int put_old_itimerspec32(const struct itimerspec64 *its, struct old_itimerspec32 __user *uits) { if 
(__put_old_timespec32(&its->it_interval, &uits->it_interval) || __put_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(put_old_itimerspec32);
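The mktime64() days calculation above is compact enough to sanity-check in isolation. The following standalone userspace program repeats the same arithmetic (with time64_t spelled as long long for illustration) and confirms two easy reference points: 1970-01-01 00:00:00 UTC maps to 0 and 2000-01-01 00:00:00 UTC maps to 946684800.

/* Standalone sanity check of the mktime64() arithmetic (userspace C). */
#include <assert.h>
#include <stdio.h>

typedef long long time64_t;

static time64_t my_mktime64(unsigned int year0, unsigned int mon0,
			    unsigned int day, unsigned int hour,
			    unsigned int min, unsigned int sec)
{
	unsigned int mon = mon0, year = year0;

	/* 1..12 -> 11,12,1..10: puts February last since it has the leap day */
	if (0 >= (int)(mon -= 2)) {
		mon += 12;
		year -= 1;
	}

	return ((((time64_t)(year / 4 - year / 100 + year / 400 +
			     367 * mon / 12 + day) +
		  year * 365 - 719499) * 24 + hour) * 60 + min) * 60 + sec;
}

int main(void)
{
	assert(my_mktime64(1970, 1, 1, 0, 0, 0) == 0);
	assert(my_mktime64(2000, 1, 1, 0, 0, 0) == 946684800LL);
	printf("mktime64 arithmetic checks out\n");
	return 0;
}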
1 12 12 12 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 // SPDX-License-Identifier: GPL-2.0 #ifndef IOU_RSRC_H #define IOU_RSRC_H #include <linux/io_uring_types.h> #include <linux/lockdep.h> #define IO_VEC_CACHE_SOFT_CAP 256 enum { IORING_RSRC_FILE = 0, IORING_RSRC_BUFFER = 1, }; struct io_rsrc_node { unsigned char type; int refs; u64 tag; union { unsigned long file_ptr; struct io_mapped_ubuf *buf; }; }; enum { IO_IMU_DEST = 1 << ITER_DEST, IO_IMU_SOURCE = 1 << ITER_SOURCE, }; struct io_mapped_ubuf { u64 ubuf; unsigned int len; unsigned int nr_bvecs; unsigned int folio_shift; refcount_t refs; unsigned long acct_pages; void (*release)(void *); void *priv; bool is_kbuf; u8 dir; struct bio_vec bvec[] __counted_by(nr_bvecs); }; struct io_imu_folio_data { /* Head folio can be partially included in the fixed buf */ unsigned int nr_pages_head; /* For non-head/tail folios, has to be fully included */ unsigned int nr_pages_mid; unsigned int folio_shift; unsigned int nr_folios; unsigned long first_folio_page_idx; }; bool io_rsrc_cache_init(struct io_ring_ctx *ctx); void io_rsrc_cache_free(struct io_ring_ctx *ctx); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type); void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node); void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data); int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr); struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, unsigned issue_flags); int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, u64 buf_addr, size_t len, int ddir, unsigned issue_flags); int io_import_reg_vec(int ddir, struct iov_iter *iter, struct io_kiocb *req, struct iou_vec *vec, unsigned nr_iovs, unsigned issue_flags); int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv, const struct iovec __user *uvec, size_t uvec_segs); int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int nr_args, u64 __user *tags); int io_sqe_files_unregister(struct io_ring_ctx *ctx); int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags); int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type); int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); int io_validate_user_buf_range(u64 uaddr, u64 ulen); bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, struct io_imu_folio_data *data); static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data, int index) { if (index < data->nr) return data->nodes[array_index_nospec(index, data->nr)]; return NULL; } static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { lockdep_assert_held(&ctx->uring_lock); if (!--node->refs) io_free_rsrc_node(ctx, node); } 
static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_data *data, int index) { struct io_rsrc_node *node = data->nodes[index]; if (!node) return false; io_put_rsrc_node(ctx, node); data->nodes[index] = NULL; return true; } int io_files_update(struct io_kiocb *req, unsigned int issue_flags); int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int __io_account_mem(struct user_struct *user, unsigned long nr_pages); int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages); void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages); static inline void __io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) { atomic_long_sub(nr_pages, &user->locked_vm); } void io_vec_free(struct iou_vec *iv); int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries); static inline void io_vec_reset_iovec(struct iou_vec *iv, struct iovec *iovec, unsigned nr) { io_vec_free(iv); iv->iovec = iovec; iv->nr = nr; } static inline void io_alloc_cache_vec_kasan(struct iou_vec *iv) { if (IS_ENABLED(CONFIG_KASAN)) io_vec_free(iv); } #endif
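io_rsrc_node_lookup() above pairs the bounds check with array_index_nospec() so the table dereference stays in bounds even under speculative execution. The sketch below is a userspace illustration of that check-then-clamp pattern only; array_index_nospec_stub() is a stand-in for the kernel helper (which clamps branch-free), not the actual API.

/* Userspace illustration of the bounds-check-then-clamp lookup pattern. */
#include <stdio.h>

struct node { int id; };

struct table {
	unsigned int nr;
	struct node **nodes;
};

/* Stand-in for the kernel's array_index_nospec(index, size). */
static unsigned int array_index_nospec_stub(unsigned int index, unsigned int size)
{
	return index < size ? index : 0;
}

static struct node *lookup(const struct table *t, unsigned int index)
{
	if (index < t->nr)
		return t->nodes[array_index_nospec_stub(index, t->nr)];
	return NULL;		/* out of range: no node */
}

int main(void)
{
	struct node a = { .id = 42 };
	struct node *slots[] = { &a };
	struct table t = { .nr = 1, .nodes = slots };

	printf("valid: %d\n", lookup(&t, 0)->id);
	printf("out of range: %s\n", lookup(&t, 7) ? "node" : "NULL");
	return 0;
}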
22 15 22 6 17 17 113 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 // SPDX-License-Identifier: MIT #include <linux/export.h> #include <linux/fb.h> #include <drm/drm_drv.h> #include <drm/drm_fbdev_shmem.h> #include <drm/drm_fb_helper.h> #include <drm/drm_framebuffer.h> #include <drm/drm_gem_framebuffer_helper.h> #include <drm/drm_gem_shmem_helper.h> /* * struct fb_ops */ static int drm_fbdev_shmem_fb_open(struct fb_info *info, int user) { struct drm_fb_helper *fb_helper = info->par; /* No need to take a ref for fbcon because it unbinds on unregister */ if (user && !try_module_get(fb_helper->dev->driver->fops->owner)) return -ENODEV; return 0; } static int drm_fbdev_shmem_fb_release(struct fb_info *info, int user) { struct drm_fb_helper *fb_helper = info->par; if (user) module_put(fb_helper->dev->driver->fops->owner); return 0; } FB_GEN_DEFAULT_DEFERRED_SYSMEM_OPS(drm_fbdev_shmem, drm_fb_helper_damage_range, drm_fb_helper_damage_area); static int drm_fbdev_shmem_fb_mmap(struct fb_info *info, struct vm_area_struct *vma) { struct drm_fb_helper *fb_helper = info->par; struct drm_framebuffer *fb = fb_helper->fb; struct drm_gem_object *obj = drm_gem_fb_get_obj(fb, 0); struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj); if (shmem->map_wc) vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); return fb_deferred_io_mmap(info, vma); } static void drm_fbdev_shmem_fb_destroy(struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; if (!fb_helper->dev) return; fb_deferred_io_cleanup(info); drm_fb_helper_fini(fb_helper); drm_client_buffer_vunmap(fb_helper->buffer); drm_client_framebuffer_delete(fb_helper->buffer); drm_client_release(&fb_helper->client); drm_fb_helper_unprepare(fb_helper); kfree(fb_helper); } static const struct fb_ops drm_fbdev_shmem_fb_ops = { .owner = THIS_MODULE, .fb_open = drm_fbdev_shmem_fb_open, .fb_release = drm_fbdev_shmem_fb_release, __FB_DEFAULT_DEFERRED_OPS_RDWR(drm_fbdev_shmem), DRM_FB_HELPER_DEFAULT_OPS, __FB_DEFAULT_DEFERRED_OPS_DRAW(drm_fbdev_shmem), .fb_mmap = drm_fbdev_shmem_fb_mmap, .fb_destroy = drm_fbdev_shmem_fb_destroy, }; static struct page *drm_fbdev_shmem_get_page(struct fb_info *info, unsigned long offset) { struct drm_fb_helper *fb_helper = info->par; struct drm_framebuffer *fb = fb_helper->fb; struct drm_gem_object *obj = drm_gem_fb_get_obj(fb, 0); struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj); unsigned int i = offset >> PAGE_SHIFT; struct page *page; if (fb_WARN_ON_ONCE(info, offset > obj->size)) return NULL; page = shmem->pages[i]; // protected by active vmap if (page) get_page(page); fb_WARN_ON_ONCE(info, !page); return page; } /* * struct drm_fb_helper */ static int drm_fbdev_shmem_helper_fb_dirty(struct drm_fb_helper *helper, struct drm_clip_rect *clip) { struct drm_device *dev = helper->dev; int ret; /* Call damage 
handlers only if necessary */ if (!(clip->x1 < clip->x2 && clip->y1 < clip->y2)) return 0; if (helper->fb->funcs->dirty) { ret = helper->fb->funcs->dirty(helper->fb, NULL, 0, 0, clip, 1); if (drm_WARN_ONCE(dev, ret, "Dirty helper failed: ret=%d\n", ret)) return ret; } return 0; } static const struct drm_fb_helper_funcs drm_fbdev_shmem_helper_funcs = { .fb_dirty = drm_fbdev_shmem_helper_fb_dirty, }; /* * struct drm_driver */ int drm_fbdev_shmem_driver_fbdev_probe(struct drm_fb_helper *fb_helper, struct drm_fb_helper_surface_size *sizes) { struct drm_client_dev *client = &fb_helper->client; struct drm_device *dev = fb_helper->dev; struct drm_client_buffer *buffer; struct drm_gem_shmem_object *shmem; struct drm_framebuffer *fb; struct fb_info *info; u32 format; struct iosys_map map; int ret; drm_dbg_kms(dev, "surface width(%d), height(%d) and bpp(%d)\n", sizes->surface_width, sizes->surface_height, sizes->surface_bpp); format = drm_driver_legacy_fb_format(dev, sizes->surface_bpp, sizes->surface_depth); buffer = drm_client_framebuffer_create(client, sizes->surface_width, sizes->surface_height, format); if (IS_ERR(buffer)) return PTR_ERR(buffer); shmem = to_drm_gem_shmem_obj(buffer->gem); fb = buffer->fb; ret = drm_client_buffer_vmap(buffer, &map); if (ret) { goto err_drm_client_buffer_delete; } else if (drm_WARN_ON(dev, map.is_iomem)) { ret = -ENODEV; /* I/O memory not supported; use generic emulation */ goto err_drm_client_buffer_delete; } fb_helper->funcs = &drm_fbdev_shmem_helper_funcs; fb_helper->buffer = buffer; fb_helper->fb = fb; info = drm_fb_helper_alloc_info(fb_helper); if (IS_ERR(info)) { ret = PTR_ERR(info); goto err_drm_client_buffer_vunmap; } drm_fb_helper_fill_info(info, fb_helper, sizes); info->fbops = &drm_fbdev_shmem_fb_ops; /* screen */ info->flags |= FBINFO_VIRTFB; /* system memory */ if (!shmem->map_wc) info->flags |= FBINFO_READS_FAST; /* signal caching */ info->screen_size = sizes->surface_height * fb->pitches[0]; info->screen_buffer = map.vaddr; info->fix.smem_len = info->screen_size; /* deferred I/O */ fb_helper->fbdefio.delay = HZ / 20; fb_helper->fbdefio.get_page = drm_fbdev_shmem_get_page; fb_helper->fbdefio.deferred_io = drm_fb_helper_deferred_io; info->fbdefio = &fb_helper->fbdefio; ret = fb_deferred_io_init(info); if (ret) goto err_drm_fb_helper_release_info; return 0; err_drm_fb_helper_release_info: drm_fb_helper_release_info(fb_helper); err_drm_client_buffer_vunmap: fb_helper->fb = NULL; fb_helper->buffer = NULL; drm_client_buffer_vunmap(buffer); err_drm_client_buffer_delete: drm_client_framebuffer_delete(buffer); return ret; } EXPORT_SYMBOL(drm_fbdev_shmem_driver_fbdev_probe);
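drm_fbdev_shmem_driver_fbdev_probe() is the only symbol this file exports; a GEM-SHMEM based driver opts into this fbdev emulation by pointing its struct drm_driver at it. The sketch below assumes the DRM_FBDEV_SHMEM_DRIVER_OPS and DRM_GEM_SHMEM_DRIVER_OPS initializer macros from <drm/drm_fbdev_shmem.h> and <drm/drm_gem_shmem_helper.h>; the driver name and feature flags are placeholders, and the fops and device registration are omitted.

/* Hedged sketch of a driver opting into the shmem fbdev emulation. */
#include <drm/drm_drv.h>
#include <drm/drm_fbdev_shmem.h>
#include <drm/drm_gem_shmem_helper.h>

static const struct drm_driver my_driver = {
	.driver_features = DRIVER_GEM | DRIVER_MODESET | DRIVER_ATOMIC,
	DRM_GEM_SHMEM_DRIVER_OPS,
	DRM_FBDEV_SHMEM_DRIVER_OPS,	/* assumed to set .fbdev_probe = drm_fbdev_shmem_driver_fbdev_probe */
	.name  = "my-drm",		/* placeholder */
	.desc  = "example driver",
	.major = 1,
	.minor = 0,
	/* .fops and device registration omitted */
};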
21016 702 268 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_PREEMPT_H #define __LINUX_PREEMPT_H /* * include/linux/preempt.h - macros for accessing and manipulating * preempt_count (used for kernel preemption, interrupt count, etc.) */ #include <linux/linkage.h> #include <linux/cleanup.h> #include <linux/types.h> /* * We put the hardirq and softirq counter into the preemption * counter. The bitmask has the following meaning: * * - bits 0-7 are the preemption count (max preemption depth: 256) * - bits 8-15 are the softirq count (max # of softirqs: 256) * * The hardirq count could in theory be the same as the number of * interrupts in the system, but we run all interrupt handlers with * interrupts disabled, so we cannot have nesting interrupts. Though * there are a few palaeontologic drivers which reenable interrupts in * the handler, so we need more than one bit here. 
* * PREEMPT_MASK: 0x000000ff * SOFTIRQ_MASK: 0x0000ff00 * HARDIRQ_MASK: 0x000f0000 * NMI_MASK: 0x00f00000 * PREEMPT_NEED_RESCHED: 0x80000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 #define HARDIRQ_BITS 4 #define NMI_BITS 4 #define PREEMPT_SHIFT 0 #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) #define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) #define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) #define __IRQ_MASK(x) ((1UL << (x))-1) #define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) #define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) #define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) #define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) #define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT) #define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) #define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /* * Disable preemption until the scheduler is running -- use an unconditional * value so that it also works on !PREEMPT_COUNT kernels. * * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count(). */ #define INIT_PREEMPT_COUNT PREEMPT_OFFSET /* * Initial preempt_count value; reflects the preempt_count schedule invariant * which states that during context switches: * * preempt_count() == 2*PREEMPT_DISABLE_OFFSET * * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels. * Note: See finish_task_switch(). */ #define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ #include <asm/preempt.h> /** * interrupt_context_level - return interrupt context level * * Returns the current interrupt context level. * 0 - normal context * 1 - softirq context * 2 - hardirq context * 3 - NMI context */ static __always_inline unsigned char interrupt_context_level(void) { unsigned long pc = preempt_count(); unsigned char level = 0; level += !!(pc & (NMI_MASK)); level += !!(pc & (NMI_MASK | HARDIRQ_MASK)); level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); return level; } /* * These macro definitions avoid redundant invocations of preempt_count() * because such invocations would result in redundant loads given that * preempt_count() is commonly implemented with READ_ONCE(). 
*/ #define nmi_count() (preempt_count() & NMI_MASK) #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #ifdef CONFIG_PREEMPT_RT # define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK) # define irq_count() ((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count()) #else # define softirq_count() (preempt_count() & SOFTIRQ_MASK) # define irq_count() (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK)) #endif /* * Macros to retrieve the current execution context: * * in_nmi() - We're in NMI context * in_hardirq() - We're in hard IRQ context * in_serving_softirq() - We're in softirq context * in_task() - We're in task context */ #define in_nmi() (nmi_count()) #define in_hardirq() (hardirq_count()) #define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) #ifdef CONFIG_PREEMPT_RT # define in_task() (!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq())) #else # define in_task() (!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) #endif /* * The following macros are deprecated and should not be used in new code: * in_irq() - Obsolete version of in_hardirq() * in_softirq() - We have BH disabled, or are processing softirqs * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled */ #define in_irq() (hardirq_count()) #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) /* * The preempt_count offset after preempt_disable(); */ #if defined(CONFIG_PREEMPT_COUNT) # define PREEMPT_DISABLE_OFFSET PREEMPT_OFFSET #else # define PREEMPT_DISABLE_OFFSET 0 #endif /* * The preempt_count offset after spin_lock() */ #if !defined(CONFIG_PREEMPT_RT) #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET #else /* Locks on RT do not disable preemption */ #define PREEMPT_LOCK_OFFSET 0 #endif /* * The preempt_count offset needed for things like: * * spin_lock_bh() * * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and * softirqs, such that unlock sequences of: * * spin_unlock(); * local_bh_enable(); * * Work as expected. */ #define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET) /* * Are we running in atomic context? WARNING: this macro cannot * always detect atomic context; in particular, it cannot know about * held spinlocks in non-preemptible kernels. Thus it should not be * used in the general case to determine whether sleeping is possible. * Do not use in_atomic() in driver code. 
*/ #define in_atomic() (preempt_count() != 0) /* * Check whether we were atomic before we did preempt_disable(): * (used by the scheduler) */ #define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET) #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE) extern void preempt_count_add(int val); extern void preempt_count_sub(int val); #define preempt_count_dec_and_test() \ ({ preempt_count_sub(1); should_resched(0); }) #else #define preempt_count_add(val) __preempt_count_add(val) #define preempt_count_sub(val) __preempt_count_sub(val) #define preempt_count_dec_and_test() __preempt_count_dec_and_test() #endif #define __preempt_count_inc() __preempt_count_add(1) #define __preempt_count_dec() __preempt_count_sub(1) #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ do { \ preempt_count_inc(); \ barrier(); \ } while (0) #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) #define preempt_enable_no_resched() sched_preempt_enable_no_resched() #define preemptible() (preempt_count() == 0 && !irqs_disabled()) #ifdef CONFIG_PREEMPTION #define preempt_enable() \ do { \ barrier(); \ if (unlikely(preempt_count_dec_and_test())) \ __preempt_schedule(); \ } while (0) #define preempt_enable_notrace() \ do { \ barrier(); \ if (unlikely(__preempt_count_dec_and_test())) \ __preempt_schedule_notrace(); \ } while (0) #define preempt_check_resched() \ do { \ if (should_resched(0)) \ __preempt_schedule(); \ } while (0) #else /* !CONFIG_PREEMPTION */ #define preempt_enable() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) #define preempt_enable_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) #define preempt_check_resched() do { } while (0) #endif /* CONFIG_PREEMPTION */ #define preempt_disable_notrace() \ do { \ __preempt_count_inc(); \ barrier(); \ } while (0) #define preempt_enable_no_resched_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) #else /* !CONFIG_PREEMPT_COUNT */ /* * Even if we don't have any preemption, we need preempt disable/enable * to be barriers, so that we don't have things like get_user/put_user * that can cause faults and scheduling migrate into our preempt-protected * region. */ #define preempt_disable() barrier() #define sched_preempt_enable_no_resched() barrier() #define preempt_enable_no_resched() barrier() #define preempt_enable() barrier() #define preempt_check_resched() do { } while (0) #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() #define preemptible() 0 #endif /* CONFIG_PREEMPT_COUNT */ #ifdef MODULE /* * Modules have no business playing preemption tricks. 
*/ #undef sched_preempt_enable_no_resched #undef preempt_enable_no_resched #undef preempt_enable_no_resched_notrace #undef preempt_check_resched #endif #define preempt_set_need_resched() \ do { \ set_preempt_need_resched(); \ } while (0) #define preempt_fold_need_resched() \ do { \ if (tif_need_resched()) \ set_preempt_need_resched(); \ } while (0) #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; struct task_struct; /** * preempt_ops - notifiers called when a task is preempted and rescheduled * @sched_in: we're about to be rescheduled: * notifier: struct preempt_notifier for the task being scheduled * cpu: cpu we're scheduled on * @sched_out: we've just been preempted * notifier: struct preempt_notifier for the task being preempted * next: the task that's kicking us out * * Please note that sched_in and out are called under different * contexts. sched_out is called with rq lock held and irq disabled * while sched_in is called without rq lock and irq enabled. This * difference is intentional and depended upon by its users. */ struct preempt_ops { void (*sched_in)(struct preempt_notifier *notifier, int cpu); void (*sched_out)(struct preempt_notifier *notifier, struct task_struct *next); }; /** * preempt_notifier - key for installing preemption notifiers * @link: internal use * @ops: defines the notifier functions to be called * * Usually used in conjunction with container_of(). */ struct preempt_notifier { struct hlist_node link; struct preempt_ops *ops; }; void preempt_notifier_inc(void); void preempt_notifier_dec(void); void preempt_notifier_register(struct preempt_notifier *notifier); void preempt_notifier_unregister(struct preempt_notifier *notifier); static inline void preempt_notifier_init(struct preempt_notifier *notifier, struct preempt_ops *ops) { /* INIT_HLIST_NODE() open coded, to avoid dependency on list.h */ notifier->link.next = NULL; notifier->link.pprev = NULL; notifier->ops = ops; } #endif /* * Migrate-Disable and why it is undesired. * * When a preempted task becomes eligible to run under the ideal model (IOW it * becomes one of the M highest priority tasks), it might still have to wait * for the preemptee's migrate_disable() section to complete. Thereby suffering * a reduction in bandwidth in the exact duration of the migrate_disable() * section. * * Per this argument, the change from preempt_disable() to migrate_disable() * gets us: * * - a higher priority tasks gains reduced wake-up latency; with preempt_disable() * it would have had to wait for the lower priority task. * * - a lower priority tasks; which under preempt_disable() could've instantly * migrated away when another CPU becomes available, is now constrained * by the ability to push the higher priority task away, which might itself be * in a migrate_disable() section, reducing its available bandwidth. * * IOW it trades latency / moves the interference term, but it stays in the * system, and as long as it remains unbounded, the system is not fully * deterministic. * * * The reason we have it anyway. * * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a * number of primitives into becoming preemptible, they would also allow * migration. This turns out to break a bunch of per-cpu usage. To this end, * all these primitives employ migrate_disable() to restore this implicit * assumption. * * This is a 'temporary' work-around at best. 
The correct solution is getting * rid of the above assumptions and reworking the code to employ explicit * per-cpu locking or short preempt-disable regions. * * The end goal must be to get rid of migrate_disable(), alternatively we need * a schedulability theory that does not depend on arbitrary migration. * * * Notes on the implementation. * * The implementation is particularly tricky since existing code patterns * dictate neither migrate_disable() nor migrate_enable() is allowed to block. * This means that it cannot use cpus_read_lock() to serialize against hotplug, * nor can it easily migrate itself into a pending affinity mask change on * migrate_enable(). * * * Note: even non-work-conserving schedulers like semi-partitioned depends on * migration, so migrate_disable() is not only a problem for * work-conserving schedulers. * */ /** * preempt_disable_nested - Disable preemption inside a normally preempt disabled section * * Use for code which requires preemption protection inside a critical * section which has preemption disabled implicitly on non-PREEMPT_RT * enabled kernels, by e.g.: * - holding a spinlock/rwlock * - soft interrupt context * - regular interrupt handlers * * On PREEMPT_RT enabled kernels spinlock/rwlock held sections, soft * interrupt context and regular interrupt handlers are preemptible and * only prevent migration. preempt_disable_nested() ensures that preemption * is disabled for cases which require CPU local serialization even on * PREEMPT_RT. For non-PREEMPT_RT kernels this is a NOP. * * The use cases are code sequences which are not serialized by a * particular lock instance, e.g.: * - seqcount write side critical sections where the seqcount is not * associated to a particular lock and therefore the automatic * protection mechanism does not work. This prevents a live lock * against a preempting high priority reader. * - RMW per CPU variable updates like vmstat. */ /* Macro to avoid header recursion hell vs. lockdep */ #define preempt_disable_nested() \ do { \ if (IS_ENABLED(CONFIG_PREEMPT_RT)) \ preempt_disable(); \ else \ lockdep_assert_preemption_disabled(); \ } while (0) /** * preempt_enable_nested - Undo the effect of preempt_disable_nested() */ static __always_inline void preempt_enable_nested(void) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); } DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable()) DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace()) #ifdef CONFIG_PREEMPT_DYNAMIC extern bool preempt_model_none(void); extern bool preempt_model_voluntary(void); extern bool preempt_model_full(void); extern bool preempt_model_lazy(void); #else static inline bool preempt_model_none(void) { return IS_ENABLED(CONFIG_PREEMPT_NONE); } static inline bool preempt_model_voluntary(void) { return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY); } static inline bool preempt_model_full(void) { return IS_ENABLED(CONFIG_PREEMPT); } static inline bool preempt_model_lazy(void) { return IS_ENABLED(CONFIG_PREEMPT_LAZY); } #endif static inline bool preempt_model_rt(void) { return IS_ENABLED(CONFIG_PREEMPT_RT); } extern const char *preempt_model_str(void); /* * Does the preemption model allow non-cooperative preemption? * * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the * PREEMPT_NONE model. 
*/ static inline bool preempt_model_preemptible(void) { return preempt_model_full() || preempt_model_lazy() || preempt_model_rt(); } #endif /* __LINUX_PREEMPT_H */
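Because all of the counters above live in a single word, every context test reduces to a mask comparison. The stand-alone program below copies the bit layout from this header (ignoring PREEMPT_NEED_RESCHED and the PREEMPT_RT softirq special case) and decodes a sample preempt_count value the same way interrupt_context_level() does; it is a sanity check of the arithmetic, not kernel code.

#include <stdio.h>

#define PREEMPT_BITS	8
#define SOFTIRQ_BITS	8
#define HARDIRQ_BITS	4
#define NMI_BITS	4
#define PREEMPT_SHIFT	0
#define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
#define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
#define NMI_SHIFT	(HARDIRQ_SHIFT + HARDIRQ_BITS)
#define __IRQ_MASK(x)	((1UL << (x)) - 1)
#define PREEMPT_MASK	(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
#define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
#define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
#define NMI_MASK	(__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
#define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)
#define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)

/* mirrors interrupt_context_level(): 0 task, 1 softirq, 2 hardirq, 3 NMI */
static unsigned int level(unsigned long pc)
{
	unsigned int l = 0;

	l += !!(pc & NMI_MASK);
	l += !!(pc & (NMI_MASK | HARDIRQ_MASK));
	l += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
	return l;
}

int main(void)
{
	/* one preempt_disable() inside a hard IRQ that interrupted a softirq */
	unsigned long pc = HARDIRQ_OFFSET + SOFTIRQ_OFFSET + 1;

	printf("preempt depth: %lu\n", pc & PREEMPT_MASK);			/* 1 */
	printf("softirq count: %lu\n", (pc & SOFTIRQ_MASK) >> SOFTIRQ_SHIFT);	/* 1 */
	printf("hardirq count: %lu\n", (pc & HARDIRQ_MASK) >> HARDIRQ_SHIFT);	/* 1 */
	printf("context level: %u\n", level(pc));				/* 2 */
	return 0;
}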
/* SPDX-License-Identifier: GPL-2.0
 *
 * FUSE: Filesystem in Userspace
 * Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
 */
#ifndef _FS_FUSE_DEV_I_H
#define _FS_FUSE_DEV_I_H

#include <linux/types.h>

/* Ordinary requests have even IDs, while interrupts IDs are odd */
#define FUSE_INT_REQ_BIT (1ULL << 0)
#define FUSE_REQ_ID_STEP (1ULL << 1)

extern struct wait_queue_head fuse_dev_waitq;

struct fuse_arg;
struct fuse_args;
struct fuse_pqueue;
struct fuse_req;
struct fuse_iqueue;
struct fuse_forget_link;

struct fuse_copy_state {
	struct fuse_req *req;
	struct iov_iter *iter;
	struct pipe_buffer *pipebufs;
	struct pipe_buffer *currbuf;
	struct pipe_inode_info *pipe;
	unsigned long nr_segs;
	struct page *pg;
	unsigned int len;
	unsigned int offset;
	bool write:1;
	bool move_folios:1;
	bool is_uring:1;
	struct {
		unsigned int copied_sz; /* copied size into the user buffer */
	} ring;
};

#define FUSE_DEV_SYNC_INIT	((struct fuse_dev *) 1)
#define FUSE_DEV_PTR_MASK	(~1UL)

static inline struct fuse_dev *__fuse_get_dev(struct file *file)
{
	/*
	 * Lockless access is OK, because file->private data is set
	 * once during mount and is valid until the file is released.
	 */
	struct fuse_dev *fud = READ_ONCE(file->private_data);

	return (typeof(fud)) ((unsigned long) fud & FUSE_DEV_PTR_MASK);
}

struct fuse_dev *fuse_get_dev(struct file *file);

unsigned int fuse_req_hash(u64 unique);
struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique);

void fuse_dev_end_requests(struct list_head *head);

void fuse_copy_init(struct fuse_copy_state *cs, bool write,
		    struct iov_iter *iter);
int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs,
		   unsigned int argpages, struct fuse_arg *args, int zeroing);
int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
		       unsigned int nbytes);

void fuse_dev_queue_forget(struct fuse_iqueue *fiq,
			   struct fuse_forget_link *forget);
void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req);

bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock);

bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list);

#endif
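The two defines above encode the ID scheme: request uniques advance by FUSE_REQ_ID_STEP and are therefore always even, while the interrupt associated with a request is addressed by setting FUSE_INT_REQ_BIT. The user-space sketch below illustrates that layout; the running counter stands in for the allocator in the queue code and is an assumption, not something defined in this header.

#include <stdint.h>
#include <stdio.h>

#define FUSE_INT_REQ_BIT (1ULL << 0)
#define FUSE_REQ_ID_STEP (1ULL << 1)

int main(void)
{
	uint64_t ctr = 0;

	for (int i = 0; i < 3; i++) {
		uint64_t unique = ctr += FUSE_REQ_ID_STEP;	/* always even */
		uint64_t intr   = unique | FUSE_INT_REQ_BIT;	/* odd twin for the interrupt */

		printf("request %llu  interrupt %llu  same request? %d\n",
		       (unsigned long long)unique, (unsigned long long)intr,
		       (intr & ~FUSE_INT_REQ_BIT) == unique);
	}
	return 0;
}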
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_CHECKSUM_H
#define __ASM_GENERIC_CHECKSUM_H

#include <linux/bitops.h>

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * returns a 32-bit number suitable for feeding into itself
 * or csum_tcpudp_magic
 *
 * this function must be called with even lengths, except
 * for the last fragment, which may be odd
 *
 * it's best to have buff aligned on a 32-bit boundary
 */
extern __wsum csum_partial(const void *buff, int len, __wsum sum);

#ifndef ip_fast_csum
/*
 * This is a version of ip_compute_csum() optimized for IP headers,
 * which always checksum on 4 octet boundaries.
 */
extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);
#endif

#ifndef csum_fold
/*
 * Fold a partial checksum
 */
static inline __sum16 csum_fold(__wsum csum)
{
	u32 sum = (__force u32)csum;

	return (__force __sum16)((~sum - ror32(sum, 16)) >> 16);
}
#endif

#ifndef csum_tcpudp_nofold
/*
 * computes the checksum of the TCP/UDP pseudo-header
 * returns a 16-bit checksum, already complemented
 */
extern __wsum
csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
		   __u8 proto, __wsum sum);
#endif

#ifndef csum_tcpudp_magic
static inline __sum16
csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
		  __u8 proto, __wsum sum)
{
	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
}
#endif

/*
 * this routine is used for miscellaneous IP-like checksums, mainly
 * in icmp.c
 */
extern __sum16 ip_compute_csum(const void *buff, int len);

#endif /* __ASM_GENERIC_CHECKSUM_H */
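csum_fold() above collapses a 32-bit one's-complement accumulator into the final 16-bit checksum with a branch-free expression. The stand-alone program below checks that expression against the classic RFC 1071 double-fold on a few sample sums; it defines its own ror32() and is purely a sanity check, not kernel code.

#include <stdint.h>
#include <stdio.h>

static uint32_t ror32(uint32_t x, unsigned int n)
{
	return (x >> n) | (x << (32 - n));
}

/* branch-free fold used by the generic csum_fold() above */
static uint16_t fold_kernel(uint32_t sum)
{
	return (uint16_t)((~sum - ror32(sum, 16)) >> 16);
}

/* classic RFC 1071 fold for comparison */
static uint16_t fold_classic(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint32_t samples[] = { 0x00010003, 0xffff0001, 0xffffffff, 0x1234abcd };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("0x%08x -> kernel 0x%04x  classic 0x%04x\n",
		       samples[i], fold_kernel(samples[i]), fold_classic(samples[i]));
	return 0;
}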
7 3 23 169 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 /* SPDX-License-Identifier: GPL-2.0-only */ /* * irq.h: in kernel interrupt controller related definitions * Copyright (c) 2007, Intel Corporation. * * Authors: * Yaozu (Eddie) Dong <Eddie.dong@intel.com> */ #ifndef __IRQ_H #define __IRQ_H #include <linux/mm_types.h> #include <linux/hrtimer.h> #include <linux/kvm_host.h> #include <linux/spinlock.h> #include <kvm/iodev.h> #include "lapic.h" #ifdef CONFIG_KVM_IOAPIC #define PIC_NUM_PINS 16 #define SELECT_PIC(irq) \ ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) struct kvm; struct kvm_vcpu; struct kvm_kpic_state { u8 last_irr; /* edge detection */ u8 irr; /* interrupt request register */ u8 imr; /* interrupt mask register */ u8 isr; /* interrupt service register */ u8 priority_add; /* highest irq priority */ u8 irq_base; u8 read_reg_select; u8 poll; u8 special_mask; u8 init_state; u8 auto_eoi; u8 rotate_on_auto_eoi; u8 special_fully_nested_mode; u8 init4; /* true if 4 byte init */ u8 elcr; /* PIIX edge/trigger selection */ u8 elcr_mask; u8 isr_ack; /* interrupt ack detection */ struct kvm_pic *pics_state; }; struct kvm_pic { spinlock_t lock; bool wakeup_needed; unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ int output; /* intr from master PIC */ struct kvm_io_device dev_master; struct kvm_io_device dev_slave; struct kvm_io_device dev_elcr; unsigned long irq_states[PIC_NUM_PINS]; }; int kvm_pic_init(struct kvm *kvm); void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status); int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm); int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); static inline int irqchip_full(struct kvm *kvm) { int mode = kvm->arch.irqchip_mode; /* Matches smp_wmb() when setting irqchip_mode */ smp_rmb(); return mode == KVM_IRQCHIP_KERNEL; } #else /* CONFIG_KVM_IOAPIC */ static __always_inline int irqchip_full(struct kvm *kvm) { return false; } #endif static inline int pic_in_kernel(struct kvm *kvm) { return irqchip_full(kvm); } static inline int irqchip_split(struct kvm *kvm) { int mode = kvm->arch.irqchip_mode; /* Matches smp_wmb() when setting irqchip_mode */ smp_rmb(); return mode == KVM_IRQCHIP_SPLIT; } static inline int irqchip_in_kernel(struct kvm *kvm) { int mode = kvm->arch.irqchip_mode; /* Matches smp_wmb() when setting irqchip_mode */ smp_rmb(); return mode != KVM_IRQCHIP_NONE; } void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_timers(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); #endif
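SELECT_PIC() above routes the 16 legacy GSIs to the emulated master or slave 8259. The small program below tabulates that mapping; the pin-within-chip index (irq & 7) and the note about GSI 2 being the cascade input reflect conventional PC/8259 wiring rather than anything defined in this header.

#include <stdio.h>

#define PIC_NUM_PINS 16

/* mirrors SELECT_PIC(): GSIs 0-7 -> master, 8-15 -> slave */
enum { PIC_MASTER, PIC_SLAVE };

int main(void)
{
	for (int irq = 0; irq < PIC_NUM_PINS; irq++) {
		int chip = irq < 8 ? PIC_MASTER : PIC_SLAVE;
		int pin  = irq & 7;	/* conventional 8259 pin index; not from the header */

		printf("GSI %2d -> %s pin %d%s\n", irq,
		       chip == PIC_MASTER ? "master" : "slave", pin,
		       irq == 2 ? "  (cascade input on a classic PC)" : "");
	}
	return 0;
}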
2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 // SPDX-License-Identifier: GPL-2.0-or-later /* * NETLINK Kernel-user communication protocol. * * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * Patrick McHardy <kaber@trash.net> * * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith * added netlink_proto_exit * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. 
de Melo <acme@conectiva.com.br> * use nlk_sk, as sk->protinfo is on a diet 8) * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org> * - inc module use count of module that owns * the kernel socket in case userspace opens * socket of same protocol * - remove all module support, since netlink is * mandatory if CONFIG_NET=y these days */ #include <linux/module.h> #include <linux/bpf.h> #include <linux/capability.h> #include <linux/kernel.h> #include <linux/filter.h> #include <linux/init.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/socket.h> #include <linux/un.h> #include <linux/fcntl.h> #include <linux/termios.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/security.h> #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/random.h> #include <linux/bitops.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/audit.h> #include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/if_arp.h> #include <linux/rhashtable.h> #include <asm/cacheflush.h> #include <linux/hash.h> #include <linux/net_namespace.h> #include <linux/nospec.h> #include <linux/btf_ids.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <net/scm.h> #include <net/netlink.h> #define CREATE_TRACE_POINTS #include <trace/events/netlink.h> #include "af_netlink.h" #include "genetlink.h" struct listeners { struct rcu_head rcu; unsigned long masks[]; }; /* state bits */ #define NETLINK_S_CONGESTED 0x0 static inline int netlink_is_kernel(struct sock *sk) { return nlk_test_bit(KERNEL_SOCKET, sk); } struct netlink_table *nl_table __read_mostly; EXPORT_SYMBOL_GPL(nl_table); static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS]; static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = { "nlk_cb_mutex-ROUTE", "nlk_cb_mutex-1", "nlk_cb_mutex-USERSOCK", "nlk_cb_mutex-FIREWALL", "nlk_cb_mutex-SOCK_DIAG", "nlk_cb_mutex-NFLOG", "nlk_cb_mutex-XFRM", "nlk_cb_mutex-SELINUX", "nlk_cb_mutex-ISCSI", "nlk_cb_mutex-AUDIT", "nlk_cb_mutex-FIB_LOOKUP", "nlk_cb_mutex-CONNECTOR", "nlk_cb_mutex-NETFILTER", "nlk_cb_mutex-IP6_FW", "nlk_cb_mutex-DNRTMSG", "nlk_cb_mutex-KOBJECT_UEVENT", "nlk_cb_mutex-GENERIC", "nlk_cb_mutex-17", "nlk_cb_mutex-SCSITRANSPORT", "nlk_cb_mutex-ECRYPTFS", "nlk_cb_mutex-RDMA", "nlk_cb_mutex-CRYPTO", "nlk_cb_mutex-SMC", "nlk_cb_mutex-23", "nlk_cb_mutex-24", "nlk_cb_mutex-25", "nlk_cb_mutex-26", "nlk_cb_mutex-27", "nlk_cb_mutex-28", "nlk_cb_mutex-29", "nlk_cb_mutex-30", "nlk_cb_mutex-31", "nlk_cb_mutex-MAX_LINKS" }; static int netlink_dump(struct sock *sk, bool lock_taken); /* nl_table locking explained: * Lookup and traversal are protected with an RCU read-side lock. Insertion * and removal are protected with per bucket lock while using RCU list * modification primitives and may run in parallel to RCU protected lookups. * Destruction of the Netlink socket may only occur *after* nl_table_lock has * been acquired * either during or after the socket has been removed from * the list and after an RCU grace period. 
*/ DEFINE_RWLOCK(nl_table_lock); EXPORT_SYMBOL_GPL(nl_table_lock); static atomic_t nl_table_users = ATOMIC_INIT(0); #define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock)); static BLOCKING_NOTIFIER_HEAD(netlink_chain); static const struct rhashtable_params netlink_rhashtable_params; void do_trace_netlink_extack(const char *msg) { trace_netlink_extack(msg); } EXPORT_SYMBOL(do_trace_netlink_extack); static inline u32 netlink_group_mask(u32 group) { if (group > 32) return 0; return group ? 1 << (group - 1) : 0; } static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, gfp_t gfp_mask) { unsigned int len = skb->len; struct sk_buff *new; new = alloc_skb(len, gfp_mask); if (new == NULL) return NULL; NETLINK_CB(new).portid = NETLINK_CB(skb).portid; NETLINK_CB(new).dst_group = NETLINK_CB(skb).dst_group; NETLINK_CB(new).creds = NETLINK_CB(skb).creds; skb_put_data(new, skb->data, len); return new; } static unsigned int netlink_tap_net_id; struct netlink_tap_net { struct list_head netlink_tap_all; struct mutex netlink_tap_lock; }; int netlink_add_tap(struct netlink_tap *nt) { struct net *net = dev_net(nt->dev); struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); if (unlikely(nt->dev->type != ARPHRD_NETLINK)) return -EINVAL; mutex_lock(&nn->netlink_tap_lock); list_add_rcu(&nt->list, &nn->netlink_tap_all); mutex_unlock(&nn->netlink_tap_lock); __module_get(nt->module); return 0; } EXPORT_SYMBOL_GPL(netlink_add_tap); static int __netlink_remove_tap(struct netlink_tap *nt) { struct net *net = dev_net(nt->dev); struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); bool found = false; struct netlink_tap *tmp; mutex_lock(&nn->netlink_tap_lock); list_for_each_entry(tmp, &nn->netlink_tap_all, list) { if (nt == tmp) { list_del_rcu(&nt->list); found = true; goto out; } } pr_warn("__netlink_remove_tap: %p not found\n", nt); out: mutex_unlock(&nn->netlink_tap_lock); if (found) module_put(nt->module); return found ? 0 : -ENODEV; } int netlink_remove_tap(struct netlink_tap *nt) { int ret; ret = __netlink_remove_tap(nt); synchronize_net(); return ret; } EXPORT_SYMBOL_GPL(netlink_remove_tap); static __net_init int netlink_tap_init_net(struct net *net) { struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); INIT_LIST_HEAD(&nn->netlink_tap_all); mutex_init(&nn->netlink_tap_lock); return 0; } static struct pernet_operations netlink_tap_net_ops = { .init = netlink_tap_init_net, .id = &netlink_tap_net_id, .size = sizeof(struct netlink_tap_net), }; static bool netlink_filter_tap(const struct sk_buff *skb) { struct sock *sk = skb->sk; /* We take the more conservative approach and * whitelist socket protocols that may pass. */ switch (sk->sk_protocol) { case NETLINK_ROUTE: case NETLINK_USERSOCK: case NETLINK_SOCK_DIAG: case NETLINK_NFLOG: case NETLINK_XFRM: case NETLINK_FIB_LOOKUP: case NETLINK_NETFILTER: case NETLINK_GENERIC: return true; } return false; } static int __netlink_deliver_tap_skb(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *nskb; struct sock *sk = skb->sk; int ret = -ENOMEM; if (!net_eq(dev_net(dev), sock_net(sk))) return 0; dev_hold(dev); if (is_vmalloc_addr(skb->head)) nskb = netlink_to_full_skb(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { nskb->dev = dev; nskb->protocol = htons((u16) sk->sk_protocol); nskb->pkt_type = netlink_is_kernel(sk) ? 
PACKET_KERNEL : PACKET_USER; skb_reset_network_header(nskb); ret = dev_queue_xmit(nskb); if (unlikely(ret > 0)) ret = net_xmit_errno(ret); } dev_put(dev); return ret; } static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net *nn) { int ret; struct netlink_tap *tmp; if (!netlink_filter_tap(skb)) return; list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) { ret = __netlink_deliver_tap_skb(skb, tmp->dev); if (unlikely(ret)) break; } } static void netlink_deliver_tap(struct net *net, struct sk_buff *skb) { struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); rcu_read_lock(); if (unlikely(!list_empty(&nn->netlink_tap_all))) __netlink_deliver_tap(skb, nn); rcu_read_unlock(); } static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src, struct sk_buff *skb) { if (!(netlink_is_kernel(dst) && netlink_is_kernel(src))) netlink_deliver_tap(sock_net(dst), skb); } static void netlink_overrun(struct sock *sk) { if (!nlk_test_bit(RECV_NO_ENOBUFS, sk)) { if (!test_and_set_bit(NETLINK_S_CONGESTED, &nlk_sk(sk)->state)) { WRITE_ONCE(sk->sk_err, ENOBUFS); sk_error_report(sk); } } sk_drops_inc(sk); } static void netlink_rcv_wake(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); if (skb_queue_empty_lockless(&sk->sk_receive_queue)) clear_bit(NETLINK_S_CONGESTED, &nlk->state); if (!test_bit(NETLINK_S_CONGESTED, &nlk->state)) wake_up_interruptible(&nlk->wait); } static void netlink_skb_destructor(struct sk_buff *skb) { if (is_vmalloc_addr(skb->head)) { if (!skb->cloned || !atomic_dec_return(&(skb_shinfo(skb)->dataref))) vfree_atomic(skb->head); skb->head = NULL; } if (skb->sk != NULL) sock_rfree(skb); } static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { WARN_ON(skb->sk != NULL); skb->sk = sk; skb->destructor = netlink_skb_destructor; sk_mem_charge(sk, skb->truesize); } static void netlink_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); return; } WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); WARN_ON(nlk_sk(sk)->groups); } /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on * SMP. Look, when several writers sleep and reader wakes them up, all but one * immediately hit write lock and grab all the cpus. Exclusive sleep solves * this, _but_ remember, it adds useless work on UP machines. 
*/ void netlink_table_grab(void) __acquires(nl_table_lock) { might_sleep(); write_lock_irq(&nl_table_lock); if (atomic_read(&nl_table_users)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&nl_table_wait, &wait); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (atomic_read(&nl_table_users) == 0) break; write_unlock_irq(&nl_table_lock); schedule(); write_lock_irq(&nl_table_lock); } __set_current_state(TASK_RUNNING); remove_wait_queue(&nl_table_wait, &wait); } } void netlink_table_ungrab(void) __releases(nl_table_lock) { write_unlock_irq(&nl_table_lock); wake_up(&nl_table_wait); } static inline void netlink_lock_table(void) { unsigned long flags; /* read_lock() synchronizes us to netlink_table_grab */ read_lock_irqsave(&nl_table_lock, flags); atomic_inc(&nl_table_users); read_unlock_irqrestore(&nl_table_lock, flags); } static inline void netlink_unlock_table(void) { if (atomic_dec_and_test(&nl_table_users)) wake_up(&nl_table_wait); } struct netlink_compare_arg { possible_net_t pnet; u32 portid; }; /* Doing sizeof directly may yield 4 extra bytes on 64-bit. */ #define netlink_compare_arg_len \ (offsetof(struct netlink_compare_arg, portid) + sizeof(u32)) static inline int netlink_compare(struct rhashtable_compare_arg *arg, const void *ptr) { const struct netlink_compare_arg *x = arg->key; const struct netlink_sock *nlk = ptr; return nlk->portid != x->portid || !net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet)); } static void netlink_compare_arg_init(struct netlink_compare_arg *arg, struct net *net, u32 portid) { memset(arg, 0, sizeof(*arg)); write_pnet(&arg->pnet, net); arg->portid = portid; } static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid, struct net *net) { struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, net, portid); return rhashtable_lookup_fast(&table->hash, &arg, netlink_rhashtable_params); } static int __netlink_insert(struct netlink_table *table, struct sock *sk) { struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid); return rhashtable_lookup_insert_key(&table->hash, &arg, &nlk_sk(sk)->node, netlink_rhashtable_params); } static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) { struct netlink_table *table = &nl_table[protocol]; struct sock *sk; rcu_read_lock(); sk = __netlink_lookup(table, portid, net); if (sk) sock_hold(sk); rcu_read_unlock(); return sk; } static const struct proto_ops netlink_ops; static void netlink_update_listeners(struct sock *sk) { struct netlink_table *tbl = &nl_table[sk->sk_protocol]; unsigned long mask; unsigned int i; struct listeners *listeners; listeners = nl_deref_protected(tbl->listeners); if (!listeners) return; for (i = 0; i < NLGRPLONGS(tbl->groups); i++) { mask = 0; sk_for_each_bound(sk, &tbl->mc_list) { if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) mask |= nlk_sk(sk)->groups[i]; } listeners->masks[i] = mask; } /* this function is only called with the netlink table "grabbed", which * makes sure updates are visible before bind or setsockopt return. */ } static int netlink_insert(struct sock *sk, u32 portid) { struct netlink_table *table = &nl_table[sk->sk_protocol]; int err; lock_sock(sk); err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY; if (nlk_sk(sk)->bound) goto err; /* portid can be read locklessly from netlink_getname(). */ WRITE_ONCE(nlk_sk(sk)->portid, portid); sock_hold(sk); err = __netlink_insert(table, sk); if (err) { /* In case the hashtable backend returns with -EBUSY * from here, it must not escape to the caller. 
*/ if (unlikely(err == -EBUSY)) err = -EOVERFLOW; if (err == -EEXIST) err = -EADDRINUSE; sock_put(sk); goto err; } /* We need to ensure that the socket is hashed and visible. */ smp_wmb(); /* Paired with lockless reads from netlink_bind(), * netlink_connect() and netlink_sendmsg(). */ WRITE_ONCE(nlk_sk(sk)->bound, portid); err: release_sock(sk); return err; } static void netlink_remove(struct sock *sk) { struct netlink_table *table; table = &nl_table[sk->sk_protocol]; if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node, netlink_rhashtable_params)) { WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } netlink_table_grab(); if (nlk_sk(sk)->subscriptions) { __sk_del_bind_node(sk); netlink_update_listeners(sk); } if (sk->sk_protocol == NETLINK_GENERIC) atomic_inc(&genl_sk_destructing_cnt); netlink_table_ungrab(); } static struct proto netlink_proto = { .name = "NETLINK", .owner = THIS_MODULE, .obj_size = sizeof(struct netlink_sock), }; static int __netlink_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct netlink_sock *nlk; sock->ops = &netlink_ops; sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); nlk = nlk_sk(sk); mutex_init(&nlk->nl_cb_mutex); lockdep_set_class_and_name(&nlk->nl_cb_mutex, nlk_cb_mutex_keys + protocol, nlk_cb_mutex_key_strings[protocol]); init_waitqueue_head(&nlk->wait); sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; return 0; } static int netlink_create(struct net *net, struct socket *sock, int protocol, int kern) { struct module *module = NULL; struct netlink_sock *nlk; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); void (*release)(struct sock *sock, unsigned long *groups); int err = 0; sock->state = SS_UNCONNECTED; if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; if (protocol < 0 || protocol >= MAX_LINKS) return -EPROTONOSUPPORT; protocol = array_index_nospec(protocol, MAX_LINKS); netlink_lock_table(); #ifdef CONFIG_MODULES if (!nl_table[protocol].registered) { netlink_unlock_table(); request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); netlink_lock_table(); } #endif if (nl_table[protocol].registered && try_module_get(nl_table[protocol].module)) module = nl_table[protocol].module; else err = -EPROTONOSUPPORT; bind = nl_table[protocol].bind; unbind = nl_table[protocol].unbind; release = nl_table[protocol].release; netlink_unlock_table(); if (err < 0) goto out; err = __netlink_create(net, sock, protocol, kern); if (err < 0) goto out_module; sock_prot_inuse_add(net, &netlink_proto, 1); nlk = nlk_sk(sock->sk); nlk->module = module; nlk->netlink_bind = bind; nlk->netlink_unbind = unbind; nlk->netlink_release = release; out: return err; out_module: module_put(module); goto out; } static void deferred_put_nlk_sk(struct rcu_head *head) { struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu); struct sock *sk = &nlk->sk; kfree(nlk->groups); nlk->groups = NULL; if (!refcount_dec_and_test(&sk->sk_refcnt)) return; sk_free(sk); } static int netlink_release(struct socket *sock) { struct sock *sk = sock->sk; struct netlink_sock *nlk; if (!sk) return 0; netlink_remove(sk); sock_orphan(sk); nlk = nlk_sk(sk); /* * OK. Socket is unlinked, any packets that arrive now * will be purged. 
*/ if (nlk->netlink_release) nlk->netlink_release(sk, nlk->groups); /* must not acquire netlink_table_lock in any way again before unbind * and notifying genetlink is done as otherwise it might deadlock */ if (nlk->netlink_unbind) { int i; for (i = 0; i < nlk->ngroups; i++) if (test_bit(i, nlk->groups)) nlk->netlink_unbind(sock_net(sk), i + 1); } if (sk->sk_protocol == NETLINK_GENERIC && atomic_dec_return(&genl_sk_destructing_cnt) == 0) wake_up(&genl_sk_destructing_waitq); sock->sk = NULL; wake_up_interruptible_all(&nlk->wait); skb_queue_purge(&sk->sk_write_queue); if (nlk->portid && nlk->bound) { struct netlink_notify n = { .net = sock_net(sk), .protocol = sk->sk_protocol, .portid = nlk->portid, }; blocking_notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n); } /* Terminate any outstanding dump */ if (nlk->cb_running) { if (nlk->cb.done) nlk->cb.done(&nlk->cb); module_put(nlk->cb.module); kfree_skb(nlk->cb.skb); WRITE_ONCE(nlk->cb_running, false); } module_put(nlk->module); if (netlink_is_kernel(sk)) { netlink_table_grab(); BUG_ON(nl_table[sk->sk_protocol].registered == 0); if (--nl_table[sk->sk_protocol].registered == 0) { struct listeners *old; old = nl_deref_protected(nl_table[sk->sk_protocol].listeners); RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL); kfree_rcu(old, rcu); nl_table[sk->sk_protocol].module = NULL; nl_table[sk->sk_protocol].bind = NULL; nl_table[sk->sk_protocol].unbind = NULL; nl_table[sk->sk_protocol].flags = 0; nl_table[sk->sk_protocol].registered = 0; } netlink_table_ungrab(); } sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); call_rcu(&nlk->rcu, deferred_put_nlk_sk); return 0; } static int netlink_autobind(struct socket *sock) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct netlink_table *table = &nl_table[sk->sk_protocol]; s32 portid = task_tgid_vnr(current); int err; s32 rover = -4096; bool ok; retry: cond_resched(); rcu_read_lock(); ok = !__netlink_lookup(table, portid, net); rcu_read_unlock(); if (!ok) { /* Bind collision, search negative portid values. */ if (rover == -4096) /* rover will be in range [S32_MIN, -4097] */ rover = S32_MIN + get_random_u32_below(-4096 - S32_MIN); else if (rover >= -4096) rover = -4097; portid = rover--; goto retry; } err = netlink_insert(sk, portid); if (err == -EADDRINUSE) goto retry; /* If 2 threads race to autobind, that is fine. */ if (err == -EBUSY) err = 0; return err; } /** * __netlink_ns_capable - General netlink message capability test * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace. * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in the user namespace @user_ns. 
*/ bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, struct user_namespace *user_ns, int cap) { return ((nsp->flags & NETLINK_SKB_DST) || file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) && ns_capable(user_ns, cap); } EXPORT_SYMBOL(__netlink_ns_capable); /** * netlink_ns_capable - General netlink message capability test * @skb: socket buffer holding a netlink command from userspace * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in the user namespace @user_ns. */ bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *user_ns, int cap) { return __netlink_ns_capable(&NETLINK_CB(skb), user_ns, cap); } EXPORT_SYMBOL(netlink_ns_capable); /** * netlink_capable - Netlink global message capability test * @skb: socket buffer holding a netlink command from userspace * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in all user namespaces. */ bool netlink_capable(const struct sk_buff *skb, int cap) { return netlink_ns_capable(skb, &init_user_ns, cap); } EXPORT_SYMBOL(netlink_capable); /** * netlink_net_capable - Netlink network namespace message capability test * @skb: socket buffer holding a netlink command from userspace * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap over the network namespace of * the socket we received the message from. 
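 *
 * Illustrative only (not part of this file): message handlers typically gate
 * privileged requests with a check such as
 *
 *	if (!netlink_net_capable(skb, CAP_NET_ADMIN))
 *		return -EPERM;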
*/ bool netlink_net_capable(const struct sk_buff *skb, int cap) { return netlink_ns_capable(skb, sock_net(skb->sk)->user_ns, cap); } EXPORT_SYMBOL(netlink_net_capable); static inline int netlink_allowed(const struct socket *sock, unsigned int flag) { return (nl_table[sock->sk->sk_protocol].flags & flag) || ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN); } static void netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions) { struct netlink_sock *nlk = nlk_sk(sk); if (nlk->subscriptions && !subscriptions) __sk_del_bind_node(sk); else if (!nlk->subscriptions && subscriptions) sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); nlk->subscriptions = subscriptions; } static int netlink_realloc_groups(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); unsigned int groups; unsigned long *new_groups; int err = 0; netlink_table_grab(); groups = nl_table[sk->sk_protocol].groups; if (!nl_table[sk->sk_protocol].registered) { err = -ENOENT; goto out_unlock; } if (nlk->ngroups >= groups) goto out_unlock; new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC); if (new_groups == NULL) { err = -ENOMEM; goto out_unlock; } memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0, NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups)); nlk->groups = new_groups; nlk->ngroups = groups; out_unlock: netlink_table_ungrab(); return err; } static void netlink_undo_bind(int group, long unsigned int groups, struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); int undo; if (!nlk->netlink_unbind) return; for (undo = 0; undo < group; undo++) if (test_bit(undo, &groups)) nlk->netlink_unbind(sock_net(sk), undo + 1); } static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err = 0; unsigned long groups; bool bound; if (addr_len < sizeof(struct sockaddr_nl)) return -EINVAL; if (nladdr->nl_family != AF_NETLINK) return -EINVAL; groups = nladdr->nl_groups; /* Only superuser is allowed to listen multicasts */ if (groups) { if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; } if (nlk->ngroups < BITS_PER_LONG) groups &= (1UL << nlk->ngroups) - 1; /* Paired with WRITE_ONCE() in netlink_insert() */ bound = READ_ONCE(nlk->bound); if (bound) { /* Ensure nlk->portid is up-to-date. */ smp_rmb(); if (nladdr->nl_pid != nlk->portid) return -EINVAL; } if (nlk->netlink_bind && groups) { int group; /* nl_groups is a u32, so cap the maximum groups we can bind */ for (group = 0; group < BITS_PER_TYPE(u32); group++) { if (!test_bit(group, &groups)) continue; err = nlk->netlink_bind(net, group + 1); if (!err) continue; netlink_undo_bind(group, groups, sk); return err; } } /* No need for barriers here as we return to user-space without * using any of the bound attributes. */ netlink_lock_table(); if (!bound) { err = nladdr->nl_pid ? 
netlink_insert(sk, nladdr->nl_pid) : netlink_autobind(sock); if (err) { netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk); goto unlock; } } if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) goto unlock; netlink_unlock_table(); netlink_table_grab(); netlink_update_subscriptions(sk, nlk->subscriptions + hweight32(groups) - hweight32(nlk->groups[0])); nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups; netlink_update_listeners(sk); netlink_table_ungrab(); return 0; unlock: netlink_unlock_table(); return err; } static int netlink_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { int err = 0; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; if (alen < sizeof(addr->sa_family)) return -EINVAL; if (addr->sa_family == AF_UNSPEC) { /* paired with READ_ONCE() in netlink_getsockbyportid() */ WRITE_ONCE(sk->sk_state, NETLINK_UNCONNECTED); /* dst_portid and dst_group can be read locklessly */ WRITE_ONCE(nlk->dst_portid, 0); WRITE_ONCE(nlk->dst_group, 0); return 0; } if (addr->sa_family != AF_NETLINK) return -EINVAL; if (alen < sizeof(struct sockaddr_nl)) return -EINVAL; if ((nladdr->nl_groups || nladdr->nl_pid) && !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; /* No need for barriers here as we return to user-space without * using any of the bound attributes. * Paired with WRITE_ONCE() in netlink_insert(). */ if (!READ_ONCE(nlk->bound)) err = netlink_autobind(sock); if (err == 0) { /* paired with READ_ONCE() in netlink_getsockbyportid() */ WRITE_ONCE(sk->sk_state, NETLINK_CONNECTED); /* dst_portid and dst_group can be read locklessly */ WRITE_ONCE(nlk->dst_portid, nladdr->nl_pid); WRITE_ONCE(nlk->dst_group, ffs(nladdr->nl_groups)); } return err; } static int netlink_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr); nladdr->nl_family = AF_NETLINK; nladdr->nl_pad = 0; if (peer) { /* Paired with WRITE_ONCE() in netlink_connect() */ nladdr->nl_pid = READ_ONCE(nlk->dst_portid); nladdr->nl_groups = netlink_group_mask(READ_ONCE(nlk->dst_group)); } else { /* Paired with WRITE_ONCE() in netlink_insert() */ nladdr->nl_pid = READ_ONCE(nlk->portid); netlink_lock_table(); nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; netlink_unlock_table(); } return sizeof(*nladdr); } static int netlink_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { /* try to hand this ioctl down to the NIC drivers. 
*/ return -ENOIOCTLCMD; } static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) { struct sock *sock; struct netlink_sock *nlk; sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid); if (!sock) return ERR_PTR(-ECONNREFUSED); /* Don't bother queuing skb if kernel socket has no input function */ nlk = nlk_sk(sock); /* dst_portid and sk_state can be changed in netlink_connect() */ if (READ_ONCE(sock->sk_state) == NETLINK_CONNECTED && READ_ONCE(nlk->dst_portid) != nlk_sk(ssk)->portid) { sock_put(sock); return ERR_PTR(-ECONNREFUSED); } return sock; } struct sock *netlink_getsockbyfd(int fd) { CLASS(fd, f)(fd); struct inode *inode; struct sock *sock; if (fd_empty(f)) return ERR_PTR(-EBADF); inode = file_inode(fd_file(f)); if (!S_ISSOCK(inode->i_mode)) return ERR_PTR(-ENOTSOCK); sock = SOCKET_I(inode)->sk; if (sock->sk_family != AF_NETLINK) return ERR_PTR(-EINVAL); sock_hold(sock); return sock; } struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast) { size_t head_size = SKB_HEAD_ALIGN(size); struct sk_buff *skb; void *data; if (head_size <= PAGE_SIZE || broadcast) return alloc_skb(size, GFP_KERNEL); data = kvmalloc(head_size, GFP_KERNEL); if (!data) return NULL; skb = __build_skb(data, head_size); if (!skb) kvfree(data); else if (is_vmalloc_addr(data)) skb->destructor = netlink_skb_destructor; return skb; } /* * Attach a skb to a netlink socket. * The caller must hold a reference to the destination socket. On error, the * reference is dropped. The skb is not send to the destination, just all * all error checks are performed and memory in the queue is reserved. * Return values: * < 0: error. skb freed, reference to sock dropped. * 0: continue * 1: repeat lookup - reference dropped while waiting for socket memory. 
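 *
 * Illustrative caller pattern (a sketch; netlink_unicast() further below
 * follows exactly this shape):
 *
 *	retry:
 *		sk = netlink_getsockbyportid(ssk, portid);
 *		...
 *		err = netlink_attachskb(sk, skb, &timeo, ssk);
 *		if (err == 1)
 *			goto retry;
 *		if (err)
 *			return err;
 *		return netlink_sendskb(sk, skb);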
*/ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk) { DECLARE_WAITQUEUE(wait, current); struct netlink_sock *nlk; unsigned int rmem; nlk = nlk_sk(sk); rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc); if ((rmem == skb->truesize || rmem <= READ_ONCE(sk->sk_rcvbuf)) && !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { netlink_skb_set_owner_r(skb, sk); return 0; } atomic_sub(skb->truesize, &sk->sk_rmem_alloc); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) netlink_overrun(sk); sock_put(sk); kfree_skb(skb); return -EAGAIN; } __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&nlk->wait, &wait); rmem = atomic_read(&sk->sk_rmem_alloc); if (((rmem && rmem + skb->truesize > READ_ONCE(sk->sk_rcvbuf)) || test_bit(NETLINK_S_CONGESTED, &nlk->state)) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); __set_current_state(TASK_RUNNING); remove_wait_queue(&nlk->wait, &wait); sock_put(sk); if (signal_pending(current)) { kfree_skb(skb); return sock_intr_errno(*timeo); } return 1; } static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = skb->len; netlink_deliver_tap(sock_net(sk), skb); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); return len; } int netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = __netlink_sendskb(sk, skb); sock_put(sk); return len; } void netlink_detachskb(struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); sock_put(sk); } static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) { int delta; skb_assert_len(skb); WARN_ON(skb->sk != NULL); delta = skb->end - skb->tail; if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) return skb; if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, allocation); if (!nskb) return skb; consume_skb(skb); skb = nskb; } pskb_expand_head(skb, 0, -delta, (allocation & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN | __GFP_NORETRY); return skb; } static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, struct sock *ssk) { int ret; struct netlink_sock *nlk = nlk_sk(sk); ret = -ECONNREFUSED; if (nlk->netlink_rcv != NULL) { ret = skb->len; atomic_add(skb->truesize, &sk->sk_rmem_alloc); netlink_skb_set_owner_r(skb, sk); NETLINK_CB(skb).sk = ssk; netlink_deliver_tap_kernel(sk, ssk, skb); nlk->netlink_rcv(skb); consume_skb(skb); } else { kfree_skb(skb); } sock_put(sk); return ret; } int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 portid, int nonblock) { struct sock *sk; int err; long timeo; skb = netlink_trim(skb, gfp_any()); timeo = sock_sndtimeo(ssk, nonblock); retry: sk = netlink_getsockbyportid(ssk, portid); if (IS_ERR(sk)) { kfree_skb(skb); return PTR_ERR(sk); } if (netlink_is_kernel(sk)) return netlink_unicast_kernel(sk, skb, ssk); if (sk_filter(sk, skb)) { err = skb->len; kfree_skb(skb); sock_put(sk); return err; } err = netlink_attachskb(sk, skb, &timeo, ssk); if (err == 1) goto retry; if (err) return err; return netlink_sendskb(sk, skb); } EXPORT_SYMBOL(netlink_unicast); int netlink_has_listeners(struct sock *sk, unsigned int group) { int res = 0; struct listeners *listeners; BUG_ON(!netlink_is_kernel(sk)); rcu_read_lock(); listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners); if (listeners && group - 1 < nl_table[sk->sk_protocol].groups) res = test_bit(group - 1, listeners->masks); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(netlink_has_listeners); bool netlink_strict_get_check(struct sk_buff *skb) { return nlk_test_bit(STRICT_CHK, 
NETLINK_CB(skb).sk); } EXPORT_SYMBOL_GPL(netlink_strict_get_check); static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) { struct netlink_sock *nlk = nlk_sk(sk); unsigned int rmem, rcvbuf; rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc); rcvbuf = READ_ONCE(sk->sk_rcvbuf); if ((rmem == skb->truesize || rmem <= rcvbuf) && !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { netlink_skb_set_owner_r(skb, sk); __netlink_sendskb(sk, skb); return rmem > (rcvbuf >> 1); } atomic_sub(skb->truesize, &sk->sk_rmem_alloc); return -1; } struct netlink_broadcast_data { struct sock *exclude_sk; struct net *net; u32 portid; u32 group; int failure; int delivery_failure; int congested; int delivered; gfp_t allocation; struct sk_buff *skb, *skb2; int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data); void *tx_data; }; static void do_one_broadcast(struct sock *sk, struct netlink_broadcast_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int val; if (p->exclude_sk == sk) return; if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) return; if (!net_eq(sock_net(sk), p->net)) { if (!nlk_test_bit(LISTEN_ALL_NSID, sk)) return; if (!peernet_has_id(sock_net(sk), p->net)) return; if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns, CAP_NET_BROADCAST)) return; } if (p->failure) { netlink_overrun(sk); return; } sock_hold(sk); if (p->skb2 == NULL) { if (skb_shared(p->skb)) { p->skb2 = skb_clone(p->skb, p->allocation); } else { p->skb2 = skb_get(p->skb); /* * skb ownership may have been set when * delivered to a previous socket. */ skb_orphan(p->skb2); } } if (p->skb2 == NULL) { netlink_overrun(sk); /* Clone failed. Notify ALL listeners. */ p->failure = 1; if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) p->delivery_failure = 1; goto out; } if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { kfree_skb(p->skb2); p->skb2 = NULL; goto out; } if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; goto out; } NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED) NETLINK_CB(p->skb2).nsid_is_set = true; val = netlink_broadcast_deliver(sk, p->skb2); if (val < 0) { netlink_overrun(sk); if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) p->delivery_failure = 1; } else { p->congested |= val; p->delivered = 1; p->skb2 = NULL; } out: sock_put(sk); } int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid, u32 group, gfp_t allocation, netlink_filter_fn filter, void *filter_data) { struct net *net = sock_net(ssk); struct netlink_broadcast_data info; struct sock *sk; skb = netlink_trim(skb, allocation); info.exclude_sk = ssk; info.net = net; info.portid = portid; info.group = group; info.failure = 0; info.delivery_failure = 0; info.congested = 0; info.delivered = 0; info.allocation = allocation; info.skb = skb; info.skb2 = NULL; info.tx_filter = filter; info.tx_data = filter_data; /* While we sleep in clone, do not allow to change socket list */ netlink_lock_table(); sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) do_one_broadcast(sk, &info); consume_skb(skb); netlink_unlock_table(); if (info.delivery_failure) { kfree_skb(info.skb2); return -ENOBUFS; } consume_skb(info.skb2); if (info.delivered) { if (info.congested && gfpflags_allow_blocking(allocation)) yield(); return 0; } return -ESRCH; } EXPORT_SYMBOL(netlink_broadcast_filtered); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, u32 group, gfp_t allocation) { 
return netlink_broadcast_filtered(ssk, skb, portid, group, allocation, NULL, NULL); } EXPORT_SYMBOL(netlink_broadcast); struct netlink_set_err_data { struct sock *exclude_sk; u32 portid; u32 group; int code; }; static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int ret = 0; if (sk == p->exclude_sk) goto out; if (!net_eq(sock_net(sk), sock_net(p->exclude_sk))) goto out; if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) goto out; if (p->code == ENOBUFS && nlk_test_bit(RECV_NO_ENOBUFS, sk)) { ret = 1; goto out; } WRITE_ONCE(sk->sk_err, p->code); sk_error_report(sk); out: return ret; } /** * netlink_set_err - report error to broadcast listeners * @ssk: the kernel netlink socket, as returned by netlink_kernel_create() * @portid: the PORTID of a process that we want to skip (if any) * @group: the broadcast group that will notice the error * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the * NETLINK_NO_ENOBUFS socket option. */ int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) { struct netlink_set_err_data info; unsigned long flags; struct sock *sk; int ret = 0; info.exclude_sk = ssk; info.portid = portid; info.group = group; /* sk->sk_err wants a positive error value */ info.code = -code; read_lock_irqsave(&nl_table_lock, flags); sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) ret += do_one_set_err(sk, &info); read_unlock_irqrestore(&nl_table_lock, flags); return ret; } EXPORT_SYMBOL(netlink_set_err); /* must be called with netlink table grabbed */ static void netlink_update_socket_mc(struct netlink_sock *nlk, unsigned int group, int is_new) { int old, new = !!is_new, subscriptions; old = test_bit(group - 1, nlk->groups); subscriptions = nlk->subscriptions - old + new; __assign_bit(group - 1, nlk->groups, new); netlink_update_subscriptions(&nlk->sk, subscriptions); netlink_update_listeners(&nlk->sk); } static int netlink_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); unsigned int val = 0; int nr = -1; if (level != SOL_NETLINK) return -ENOPROTOOPT; if (optlen >= sizeof(int) && copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (optname) { case NETLINK_PKTINFO: nr = NETLINK_F_RECV_PKTINFO; break; case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: { int err; if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; if (!val || val - 1 >= nlk->ngroups) return -EINVAL; if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) { err = nlk->netlink_bind(sock_net(sk), val); if (err) return err; } netlink_table_grab(); netlink_update_socket_mc(nlk, val, optname == NETLINK_ADD_MEMBERSHIP); netlink_table_ungrab(); if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind) nlk->netlink_unbind(sock_net(sk), val); break; } case NETLINK_BROADCAST_ERROR: nr = NETLINK_F_BROADCAST_SEND_ERROR; break; case NETLINK_NO_ENOBUFS: assign_bit(NETLINK_F_RECV_NO_ENOBUFS, &nlk->flags, val); if (val) { clear_bit(NETLINK_S_CONGESTED, &nlk->state); wake_up_interruptible(&nlk->wait); } break; case NETLINK_LISTEN_ALL_NSID: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) return -EPERM; nr = NETLINK_F_LISTEN_ALL_NSID; break; case NETLINK_CAP_ACK: nr = NETLINK_F_CAP_ACK; break; 
case NETLINK_EXT_ACK: nr = NETLINK_F_EXT_ACK; break; case NETLINK_GET_STRICT_CHK: nr = NETLINK_F_STRICT_CHK; break; default: return -ENOPROTOOPT; } if (nr >= 0) assign_bit(nr, &nlk->flags, val); return 0; } static int netlink_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); unsigned int flag; int len, val; if (level != SOL_NETLINK) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case NETLINK_PKTINFO: flag = NETLINK_F_RECV_PKTINFO; break; case NETLINK_BROADCAST_ERROR: flag = NETLINK_F_BROADCAST_SEND_ERROR; break; case NETLINK_NO_ENOBUFS: flag = NETLINK_F_RECV_NO_ENOBUFS; break; case NETLINK_LIST_MEMBERSHIPS: { int pos, idx, shift, err = 0; netlink_lock_table(); for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) { if (len - pos < sizeof(u32)) break; idx = pos / sizeof(unsigned long); shift = (pos % sizeof(unsigned long)) * 8; if (put_user((u32)(nlk->groups[idx] >> shift), (u32 __user *)(optval + pos))) { err = -EFAULT; break; } } if (put_user(ALIGN(BITS_TO_BYTES(nlk->ngroups), sizeof(u32)), optlen)) err = -EFAULT; netlink_unlock_table(); return err; } case NETLINK_LISTEN_ALL_NSID: flag = NETLINK_F_LISTEN_ALL_NSID; break; case NETLINK_CAP_ACK: flag = NETLINK_F_CAP_ACK; break; case NETLINK_EXT_ACK: flag = NETLINK_F_EXT_ACK; break; case NETLINK_GET_STRICT_CHK: flag = NETLINK_F_STRICT_CHK; break; default: return -ENOPROTOOPT; } if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = test_bit(flag, &nlk->flags); if (put_user(len, optlen) || copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct nl_pktinfo info; info.group = NETLINK_CB(skb).dst_group; put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); } static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { if (!NETLINK_CB(skb).nsid_is_set) return; put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int), &NETLINK_CB(skb).nsid); } static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); u32 dst_portid; u32 dst_group; struct sk_buff *skb; int err; struct scm_cookie scm; u32 netlink_skb_flags = 0; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; if (len == 0) { pr_warn_once("Zero length message leads to an empty skb\n"); return -ENODATA; } err = scm_send(sock, msg, &scm, true); if (err < 0) return err; if (msg->msg_namelen) { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_nl)) goto out; if (addr->nl_family != AF_NETLINK) goto out; dst_portid = addr->nl_pid; dst_group = ffs(addr->nl_groups); err = -EPERM; if ((dst_group || dst_portid) && !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) goto out; netlink_skb_flags |= NETLINK_SKB_DST; } else { /* Paired with WRITE_ONCE() in netlink_connect() */ dst_portid = READ_ONCE(nlk->dst_portid); dst_group = READ_ONCE(nlk->dst_group); } /* Paired with WRITE_ONCE() in netlink_insert() */ if (!READ_ONCE(nlk->bound)) { err = netlink_autobind(sock); if (err) goto out; } else { /* Ensure nlk is hashed and visible. 
*/ smp_rmb(); } err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; skb = netlink_alloc_large_skb(len, dst_group); if (skb == NULL) goto out; NETLINK_CB(skb).portid = nlk->portid; NETLINK_CB(skb).dst_group = dst_group; NETLINK_CB(skb).creds = scm.creds; NETLINK_CB(skb).flags = netlink_skb_flags; err = -EFAULT; if (memcpy_from_msg(skb_put(skb, len), msg, len)) { kfree_skb(skb); goto out; } err = security_netlink_send(sk, skb); if (err) { kfree_skb(skb); goto out; } if (dst_group) { refcount_inc(&skb->users); netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL); } err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags & MSG_DONTWAIT); out: scm_destroy(&scm); return err; } static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct scm_cookie scm; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); size_t copied, max_recvmsg_len; struct sk_buff *skb, *data_skb; int err, ret; if (flags & MSG_OOB) return -EOPNOTSUPP; copied = 0; skb = skb_recv_datagram(sk, flags, &err); if (skb == NULL) goto out; data_skb = skb; #ifdef CONFIG_COMPAT_NETLINK_MESSAGES if (unlikely(skb_shinfo(skb)->frag_list)) { /* * If this skb has a frag_list, then here that means that we * will have to use the frag_list skb's data for compat tasks * and the regular skb's data for normal (non-compat) tasks. * * If we need to send the compat skb, assign it to the * 'data_skb' variable so that it will be used below for data * copying. We keep 'skb' for everything else, including * freeing both later. */ if (flags & MSG_CMSG_COMPAT) data_skb = skb_shinfo(skb)->frag_list; } #endif /* Record the max length of recvmsg() calls for future allocations */ max_recvmsg_len = max(READ_ONCE(nlk->max_recvmsg_len), len); max_recvmsg_len = min_t(size_t, max_recvmsg_len, SKB_WITH_OVERHEAD(32768)); WRITE_ONCE(nlk->max_recvmsg_len, max_recvmsg_len); copied = data_skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(data_skb, 0, msg, copied); if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); addr->nl_family = AF_NETLINK; addr->nl_pad = 0; addr->nl_pid = NETLINK_CB(skb).portid; addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group); msg->msg_namelen = sizeof(*addr); } if (nlk_test_bit(RECV_PKTINFO, sk)) netlink_cmsg_recv_pktinfo(msg, skb); if (nlk_test_bit(LISTEN_ALL_NSID, sk)) netlink_cmsg_listen_all_nsid(sk, msg, skb); memset(&scm, 0, sizeof(scm)); scm.creds = *NETLINK_CREDS(skb); if (flags & MSG_TRUNC) copied = data_skb->len; skb_free_datagram(sk, skb); if (READ_ONCE(nlk->cb_running) && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk, false); if (ret) { WRITE_ONCE(sk->sk_err, -ret); sk_error_report(sk); } } scm_recv(sock, msg, &scm, flags); out: netlink_rcv_wake(sk); return err ? : copied; } static void netlink_data_ready(struct sock *sk) { BUG(); } /* * We export these functions to other modules. They provide a * complete set of kernel non-blocking support for message * queueing. 
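 *
 * Illustrative only (a sketch, not part of this file; MY_PROTO and
 * my_subsys_rcv() are hypothetical): a subsystem normally goes through the
 * netlink_kernel_create() wrapper rather than calling
 * __netlink_kernel_create() directly:
 *
 *	struct netlink_kernel_cfg cfg = {
 *		.groups	= 32,
 *		.input	= my_subsys_rcv,
 *	};
 *
 *	sk = netlink_kernel_create(&init_net, MY_PROTO, &cfg);
 *	...
 *	netlink_kernel_release(sk);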
*/ struct sock * __netlink_kernel_create(struct net *net, int unit, struct module *module, struct netlink_kernel_cfg *cfg) { struct socket *sock; struct sock *sk; struct netlink_sock *nlk; struct listeners *listeners = NULL; unsigned int groups; BUG_ON(!nl_table); if (unit < 0 || unit >= MAX_LINKS) return NULL; if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; if (__netlink_create(net, sock, unit, 1) < 0) goto out_sock_release_nosk; sk = sock->sk; if (!cfg || cfg->groups < 32) groups = 32; else groups = cfg->groups; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) goto out_sock_release; sk->sk_data_ready = netlink_data_ready; if (cfg && cfg->input) nlk_sk(sk)->netlink_rcv = cfg->input; if (netlink_insert(sk, 0)) goto out_sock_release; nlk = nlk_sk(sk); set_bit(NETLINK_F_KERNEL_SOCKET, &nlk->flags); netlink_table_grab(); if (!nl_table[unit].registered) { nl_table[unit].groups = groups; rcu_assign_pointer(nl_table[unit].listeners, listeners); nl_table[unit].module = module; if (cfg) { nl_table[unit].bind = cfg->bind; nl_table[unit].unbind = cfg->unbind; nl_table[unit].release = cfg->release; nl_table[unit].flags = cfg->flags; } nl_table[unit].registered = 1; } else { kfree(listeners); nl_table[unit].registered++; } netlink_table_ungrab(); return sk; out_sock_release: kfree(listeners); netlink_kernel_release(sk); return NULL; out_sock_release_nosk: sock_release(sock); return NULL; } EXPORT_SYMBOL(__netlink_kernel_create); void netlink_kernel_release(struct sock *sk) { if (sk == NULL || sk->sk_socket == NULL) return; sock_release(sk->sk_socket); } EXPORT_SYMBOL(netlink_kernel_release); int __netlink_change_ngroups(struct sock *sk, unsigned int groups) { struct listeners *new, *old; struct netlink_table *tbl = &nl_table[sk->sk_protocol]; if (groups < 32) groups = 32; if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) { new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC); if (!new) return -ENOMEM; old = nl_deref_protected(tbl->listeners); memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups)); rcu_assign_pointer(tbl->listeners, new); kfree_rcu(old, rcu); } tbl->groups = groups; return 0; } /** * netlink_change_ngroups - change number of multicast groups * * This changes the number of multicast groups that are available * on a certain netlink family. Note that it is not possible to * change the number of groups to below 32. Also note that it does * not implicitly call netlink_clear_multicast_users() when the * number of groups is reduced. * * @sk: The kernel netlink socket, as returned by netlink_kernel_create(). * @groups: The new number of groups. 
*/ int netlink_change_ngroups(struct sock *sk, unsigned int groups) { int err; netlink_table_grab(); err = __netlink_change_ngroups(sk, groups); netlink_table_ungrab(); return err; } void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group) { struct sock *sk; struct netlink_table *tbl = &nl_table[ksk->sk_protocol]; struct hlist_node *tmp; sk_for_each_bound_safe(sk, tmp, &tbl->mc_list) netlink_update_socket_mc(nlk_sk(sk), group, 0); } struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags) { struct nlmsghdr *nlh; int size = nlmsg_msg_size(len); nlh = skb_put(skb, NLMSG_ALIGN(size)); nlh->nlmsg_type = type; nlh->nlmsg_len = size; nlh->nlmsg_flags = flags; nlh->nlmsg_pid = portid; nlh->nlmsg_seq = seq; if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0) memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size); return nlh; } EXPORT_SYMBOL(__nlmsg_put); static size_t netlink_ack_tlv_len(struct netlink_sock *nlk, int err, const struct netlink_ext_ack *extack) { size_t tlvlen; if (!extack || !test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) return 0; tlvlen = 0; if (extack->_msg) tlvlen += nla_total_size(strlen(extack->_msg) + 1); if (extack->cookie_len) tlvlen += nla_total_size(extack->cookie_len); /* Following attributes are only reported as error (not warning) */ if (!err) return tlvlen; if (extack->bad_attr) tlvlen += nla_total_size(sizeof(u32)); if (extack->policy) tlvlen += netlink_policy_dump_attr_size_estimate(extack->policy); if (extack->miss_type) tlvlen += nla_total_size(sizeof(u32)); if (extack->miss_nest) tlvlen += nla_total_size(sizeof(u32)); return tlvlen; } static bool nlmsg_check_in_payload(const struct nlmsghdr *nlh, const void *addr) { return !WARN_ON(addr < nlmsg_data(nlh) || addr - (const void *) nlh >= nlh->nlmsg_len); } static void netlink_ack_tlv_fill(struct sk_buff *skb, const struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { if (extack->_msg) WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)); if (extack->cookie_len) WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, extack->cookie_len, extack->cookie)); if (!err) return; if (extack->bad_attr && nlmsg_check_in_payload(nlh, extack->bad_attr)) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, (u8 *)extack->bad_attr - (const u8 *)nlh)); if (extack->policy) netlink_policy_dump_write_attr(skb, extack->policy, NLMSGERR_ATTR_POLICY); if (extack->miss_type) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_TYPE, extack->miss_type)); if (extack->miss_nest && nlmsg_check_in_payload(nlh, extack->miss_nest)) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_NEST, (u8 *)extack->miss_nest - (const u8 *)nlh)); } /* * It looks a bit ugly. * It would be better to create kernel thread. 
*/ static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, struct netlink_callback *cb, struct netlink_ext_ack *extack) { struct nlmsghdr *nlh; size_t extack_len; nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(nlk->dump_done_errno), NLM_F_MULTI | cb->answer_flags); if (WARN_ON(!nlh)) return -ENOBUFS; nl_dump_check_consistent(cb, nlh); memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno)); extack_len = netlink_ack_tlv_len(nlk, nlk->dump_done_errno, extack); if (extack_len) { nlh->nlmsg_flags |= NLM_F_ACK_TLVS; if (skb_tailroom(skb) >= extack_len) { netlink_ack_tlv_fill(skb, cb->nlh, nlk->dump_done_errno, extack); nlmsg_end(skb, nlh); } } return 0; } static int netlink_dump(struct sock *sk, bool lock_taken) { struct netlink_sock *nlk = nlk_sk(sk); struct netlink_ext_ack extack = {}; struct netlink_callback *cb; struct sk_buff *skb = NULL; unsigned int rmem, rcvbuf; size_t max_recvmsg_len; struct module *module; int err = -ENOBUFS; int alloc_min_size; int alloc_size; if (!lock_taken) mutex_lock(&nlk->nl_cb_mutex); if (!nlk->cb_running) { err = -EINVAL; goto errout_skb; } /* NLMSG_GOODSIZE is small to avoid high order allocations being * required, but it makes sense to _attempt_ a 32KiB allocation * to reduce number of system calls on dump operations, if user * ever provided a big enough buffer. */ cb = &nlk->cb; alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); max_recvmsg_len = READ_ONCE(nlk->max_recvmsg_len); if (alloc_min_size < max_recvmsg_len) { alloc_size = max_recvmsg_len; skb = alloc_skb(alloc_size, (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN | __GFP_NORETRY); } if (!skb) { alloc_size = alloc_min_size; skb = alloc_skb(alloc_size, GFP_KERNEL); } if (!skb) goto errout_skb; rcvbuf = READ_ONCE(sk->sk_rcvbuf); rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc); if (rmem != skb->truesize && rmem >= rcvbuf) { atomic_sub(skb->truesize, &sk->sk_rmem_alloc); goto errout_skb; } /* Trim skb to allocated size. User is expected to provide buffer as * large as max(min_dump_alloc, 32KiB (max_recvmsg_len capped at * netlink_recvmsg())). dump will pack as many smaller messages as * could fit within the allocated skb. skb is typically allocated * with larger space than required (could be as much as near 2x the * requested size with align to next power of 2 approach). Allowing * dump to use the excess space makes it difficult for a user to have a * reasonable static buffer based on the expected largest dump of a * single netdev. The outcome is MSG_TRUNC error. */ skb_reserve(skb, skb_tailroom(skb) - alloc_size); /* Make sure malicious BPF programs can not read unitialized memory * from skb->head -> skb->data */ skb_reset_network_header(skb); skb_reset_mac_header(skb); netlink_skb_set_owner_r(skb, sk); if (nlk->dump_done_errno > 0) { cb->extack = &extack; nlk->dump_done_errno = cb->dump(skb, cb); /* EMSGSIZE plus something already in the skb means * that there's more to dump but current skb has filled up. * If the callback really wants to return EMSGSIZE to user space * it needs to do so again, on the next cb->dump() call, * without putting data in the skb. 
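 *
 * Illustrative dump-callback convention (a sketch, not from this file;
 * example_fill() is a hypothetical helper):
 *
 *	static int example_dump(struct sk_buff *skb, struct netlink_callback *cb)
 *	{
 *		int err = example_fill(skb, cb);
 *
 *		if (err == -EMSGSIZE && skb->len)
 *			return skb->len;	(skb full, dump continues)
 *		return err;			(0 ends the dump, <0 is an error)
 *	}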
*/ if (nlk->dump_done_errno == -EMSGSIZE && skb->len) nlk->dump_done_errno = skb->len; cb->extack = NULL; } if (nlk->dump_done_errno > 0 || skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) { mutex_unlock(&nlk->nl_cb_mutex); if (sk_filter(sk, skb)) kfree_skb(skb); else __netlink_sendskb(sk, skb); return 0; } if (netlink_dump_done(nlk, skb, cb, &extack)) goto errout_skb; #ifdef CONFIG_COMPAT_NETLINK_MESSAGES /* frag_list skb's data is used for compat tasks * and the regular skb's data for normal (non-compat) tasks. * See netlink_recvmsg(). */ if (unlikely(skb_shinfo(skb)->frag_list)) { if (netlink_dump_done(nlk, skb_shinfo(skb)->frag_list, cb, &extack)) goto errout_skb; } #endif if (sk_filter(sk, skb)) kfree_skb(skb); else __netlink_sendskb(sk, skb); if (cb->done) cb->done(cb); WRITE_ONCE(nlk->cb_running, false); module = cb->module; skb = cb->skb; mutex_unlock(&nlk->nl_cb_mutex); module_put(module); consume_skb(skb); return 0; errout_skb: mutex_unlock(&nlk->nl_cb_mutex); kfree_skb(skb); return err; } int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { struct netlink_callback *cb; struct netlink_sock *nlk; struct sock *sk; int ret; refcount_inc(&skb->users); sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid); if (sk == NULL) { ret = -ECONNREFUSED; goto error_free; } nlk = nlk_sk(sk); mutex_lock(&nlk->nl_cb_mutex); /* A dump is in progress... */ if (nlk->cb_running) { ret = -EBUSY; goto error_unlock; } /* add reference of module which cb->dump belongs to */ if (!try_module_get(control->module)) { ret = -EPROTONOSUPPORT; goto error_unlock; } cb = &nlk->cb; memset(cb, 0, sizeof(*cb)); cb->dump = control->dump; cb->done = control->done; cb->nlh = nlh; cb->data = control->data; cb->module = control->module; cb->min_dump_alloc = control->min_dump_alloc; cb->flags = control->flags; cb->skb = skb; cb->strict_check = nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk); if (control->start) { cb->extack = control->extack; ret = control->start(cb); cb->extack = NULL; if (ret) goto error_put; } WRITE_ONCE(nlk->cb_running, true); nlk->dump_done_errno = INT_MAX; ret = netlink_dump(sk, true); sock_put(sk); if (ret) return ret; /* We successfully started a dump, by returning -EINTR we * signal not to send ACK even if it was requested. */ return -EINTR; error_put: module_put(control->module); error_unlock: sock_put(sk); mutex_unlock(&nlk->nl_cb_mutex); error_free: kfree_skb(skb); return ret; } EXPORT_SYMBOL(__netlink_dump_start); void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { struct sk_buff *skb; struct nlmsghdr *rep; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg); struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk); unsigned int flags = 0; size_t tlvlen; /* Error messages get the original request appended, unless the user * requests to cap the error message, and get extra error data if * requested. 
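 *
 * Illustrative only (not part of this file): user space opts in to capped
 * acks and extended error reporting with
 *
 *	int one = 1;
 *
 *	setsockopt(fd, SOL_NETLINK, NETLINK_CAP_ACK, &one, sizeof(one));
 *	setsockopt(fd, SOL_NETLINK, NETLINK_EXT_ACK, &one, sizeof(one));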
*/ if (err && !test_bit(NETLINK_F_CAP_ACK, &nlk->flags)) payload += nlmsg_len(nlh); else flags |= NLM_F_CAPPED; tlvlen = netlink_ack_tlv_len(nlk, err, extack); if (tlvlen) flags |= NLM_F_ACK_TLVS; skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); if (!skb) goto err_skb; rep = nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, NLMSG_ERROR, sizeof(*errmsg), flags); if (!rep) goto err_bad_put; errmsg = nlmsg_data(rep); errmsg->error = err; errmsg->msg = *nlh; if (!(flags & NLM_F_CAPPED)) { if (!nlmsg_append(skb, nlmsg_len(nlh))) goto err_bad_put; memcpy(nlmsg_data(&errmsg->msg), nlmsg_data(nlh), nlmsg_len(nlh)); } if (tlvlen) netlink_ack_tlv_fill(skb, nlh, err, extack); nlmsg_end(skb, rep); nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid); return; err_bad_put: nlmsg_free(skb); err_skb: WRITE_ONCE(NETLINK_CB(in_skb).sk->sk_err, ENOBUFS); sk_error_report(NETLINK_CB(in_skb).sk); } EXPORT_SYMBOL(netlink_ack); int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)) { struct netlink_ext_ack extack; struct nlmsghdr *nlh; int err; while (skb->len >= nlmsg_total_size(0)) { int msglen; memset(&extack, 0, sizeof(extack)); nlh = nlmsg_hdr(skb); err = 0; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return 0; /* Only requests are handled by the kernel */ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) goto ack; /* Skip control messages */ if (nlh->nlmsg_type < NLMSG_MIN_TYPE) goto ack; err = cb(skb, nlh, &extack); if (err == -EINTR) goto skip; ack: if (nlh->nlmsg_flags & NLM_F_ACK || err) netlink_ack(skb, nlh, err, &extack); skip: msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); } return 0; } EXPORT_SYMBOL(netlink_rcv_skb); /** * nlmsg_notify - send a notification netlink message * @sk: netlink socket to use * @skb: notification message * @portid: destination netlink portid for reports or 0 * @group: destination multicast group or 0 * @report: 1 to report back, 0 to disable * @flags: allocation flags */ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags) { int err = 0; if (group) { int exclude_portid = 0; if (report) { refcount_inc(&skb->users); exclude_portid = portid; } /* errors reported via destination sk->sk_err, but propagate * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ err = nlmsg_multicast(sk, skb, exclude_portid, group, flags); if (err == -ESRCH) err = 0; } if (report) { int err2; err2 = nlmsg_unicast(sk, skb, portid); if (!err) err = err2; } return err; } EXPORT_SYMBOL(nlmsg_notify); #ifdef CONFIG_PROC_FS struct nl_seq_iter { struct seq_net_private p; struct rhashtable_iter hti; int link; }; static void netlink_walk_start(struct nl_seq_iter *iter) { rhashtable_walk_enter(&nl_table[iter->link].hash, &iter->hti); rhashtable_walk_start(&iter->hti); } static void netlink_walk_stop(struct nl_seq_iter *iter) { rhashtable_walk_stop(&iter->hti); rhashtable_walk_exit(&iter->hti); } static void *__netlink_seq_next(struct seq_file *seq) { struct nl_seq_iter *iter = seq->private; struct netlink_sock *nlk; do { for (;;) { nlk = rhashtable_walk_next(&iter->hti); if (IS_ERR(nlk)) { if (PTR_ERR(nlk) == -EAGAIN) continue; return nlk; } if (nlk) break; netlink_walk_stop(iter); if (++iter->link >= MAX_LINKS) return NULL; netlink_walk_start(iter); } } while (sock_net(&nlk->sk) != seq_file_net(seq)); return nlk; } static void *netlink_seq_start(struct seq_file *seq, loff_t *posp) __acquires(RCU) { struct 
nl_seq_iter *iter = seq->private; void *obj = SEQ_START_TOKEN; loff_t pos; iter->link = 0; netlink_walk_start(iter); for (pos = *posp; pos && obj && !IS_ERR(obj); pos--) obj = __netlink_seq_next(seq); return obj; } static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return __netlink_seq_next(seq); } static void netlink_native_seq_stop(struct seq_file *seq, void *v) { struct nl_seq_iter *iter = seq->private; if (iter->link >= MAX_LINKS) return; netlink_walk_stop(iter); } static int netlink_native_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, "sk Eth Pid Groups " "Rmem Wmem Dump Locks Drops Inode\n"); } else { struct sock *s = v; struct netlink_sock *nlk = nlk_sk(s); seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8u %-8lu\n", s, s->sk_protocol, nlk->portid, nlk->groups ? (u32)nlk->groups[0] : 0, sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), READ_ONCE(nlk->cb_running), refcount_read(&s->sk_refcnt), sk_drops_read(s), sock_i_ino(s) ); } return 0; } #ifdef CONFIG_BPF_SYSCALL struct bpf_iter__netlink { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct netlink_sock *, sk); }; DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk) static int netlink_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__netlink ctx; meta->seq_num--; /* skip SEQ_START_TOKEN */ ctx.meta = meta; ctx.sk = nlk_sk((struct sock *)v); return bpf_iter_run_prog(prog, &ctx); } static int netlink_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return netlink_native_seq_show(seq, v); if (v != SEQ_START_TOKEN) return netlink_prog_seq_show(prog, &meta, v); return 0; } static void netlink_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)netlink_prog_seq_show(prog, &meta, v); } netlink_native_seq_stop(seq, v); } #else static int netlink_seq_show(struct seq_file *seq, void *v) { return netlink_native_seq_show(seq, v); } static void netlink_seq_stop(struct seq_file *seq, void *v) { netlink_native_seq_stop(seq, v); } #endif static const struct seq_operations netlink_seq_ops = { .start = netlink_seq_start, .next = netlink_seq_next, .stop = netlink_seq_stop, .show = netlink_seq_show, }; #endif int netlink_register_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&netlink_chain, nb); } EXPORT_SYMBOL(netlink_register_notifier); int netlink_unregister_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&netlink_chain, nb); } EXPORT_SYMBOL(netlink_unregister_notifier); static const struct proto_ops netlink_ops = { .family = PF_NETLINK, .owner = THIS_MODULE, .release = netlink_release, .bind = netlink_bind, .connect = netlink_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, .poll = datagram_poll, .ioctl = netlink_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = netlink_setsockopt, .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, .mmap = sock_no_mmap, }; static const struct net_proto_family netlink_family_ops = { .family = PF_NETLINK, .create = netlink_create, .owner = THIS_MODULE, /* for consistency 8) */ }; static int __net_init netlink_net_init(struct net *net) { #ifdef 
CONFIG_PROC_FS if (!proc_create_net("netlink", 0, net->proc_net, &netlink_seq_ops, sizeof(struct nl_seq_iter))) return -ENOMEM; #endif return 0; } static void __net_exit netlink_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("netlink", net->proc_net); #endif } static void __init netlink_add_usersock_entry(void) { struct listeners *listeners; int groups = 32; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) panic("netlink_add_usersock_entry: Cannot allocate listeners\n"); netlink_table_grab(); nl_table[NETLINK_USERSOCK].groups = groups; rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners); nl_table[NETLINK_USERSOCK].module = THIS_MODULE; nl_table[NETLINK_USERSOCK].registered = 1; nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND; netlink_table_ungrab(); } static struct pernet_operations __net_initdata netlink_net_ops = { .init = netlink_net_init, .exit = netlink_net_exit, }; static inline u32 netlink_hash(const void *data, u32 len, u32 seed) { const struct netlink_sock *nlk = data; struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid); return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed); } static const struct rhashtable_params netlink_rhashtable_params = { .head_offset = offsetof(struct netlink_sock, node), .key_len = netlink_compare_arg_len, .obj_hashfn = netlink_hash, .obj_cmpfn = netlink_compare, .automatic_shrinking = true, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) BTF_ID_LIST_SINGLE(btf_netlink_sock_id, struct, netlink_sock) static const struct bpf_iter_seq_info netlink_seq_info = { .seq_ops = &netlink_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct nl_seq_iter), }; static struct bpf_iter_reg netlink_reg_info = { .target = "netlink", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__netlink, sk), PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &netlink_seq_info, }; static int __init bpf_iter_register(void) { netlink_reg_info.ctx_arg_info[0].btf_id = *btf_netlink_sock_id; return bpf_iter_reg_target(&netlink_reg_info); } #endif static int __init netlink_proto_init(void) { int i; int err = proto_register(&netlink_proto, 0); if (err != 0) goto out; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) err = bpf_iter_register(); if (err) goto out; #endif BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb)); nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); if (!nl_table) goto panic; for (i = 0; i < MAX_LINKS; i++) { if (rhashtable_init(&nl_table[i].hash, &netlink_rhashtable_params) < 0) goto panic; } netlink_add_usersock_entry(); sock_register(&netlink_family_ops); register_pernet_subsys(&netlink_net_ops); register_pernet_subsys(&netlink_tap_net_ops); /* The netlink device handler may be needed early. */ rtnetlink_init(); out: return err; panic: panic("netlink_init: Cannot allocate nl_table\n"); } core_initcall(netlink_proto_init);
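The implementation above is exercised from user space through the ordinary
socket API. A minimal sketch (illustrative only, not part of the kernel
sources; it uses NETLINK_USERSOCK and relies on nl_pid 0 to request an
autobound portid via netlink_autobind() above):

	#include <unistd.h>
	#include <sys/socket.h>
	#include <linux/netlink.h>

	int main(void)
	{
		struct sockaddr_nl local = { .nl_family = AF_NETLINK };
		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_USERSOCK);

		if (fd < 0)
			return 1;
		/* nl_pid 0: ask the kernel to pick a unique portid */
		if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0)
			return 1;
		close(fd);
		return 0;
	}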
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Simple MTD partitioning layer
 *
 * Copyright © 2000 Nicolas Pitre <nico@fluxnic.net>
 * Copyright © 2002 Thomas Gleixner <gleixner@linutronix.de>
 * Copyright © 2000-2010 David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/kmod.h>
#include <linux/mtd/mtd.h>
#include <linux/mtd/partitions.h>
#include <linux/err.h>
#include <linux/of.h>
#include <linux/of_platform.h>

#include "mtdcore.h"

/*
 * MTD methods which simply translate the
effective address and pass through * to the _real_ device. */ static inline void free_partition(struct mtd_info *mtd) { kfree(mtd->name); kfree(mtd); } void release_mtd_partition(struct mtd_info *mtd) { WARN_ON(!list_empty(&mtd->part.node)); free_partition(mtd); } static struct mtd_info *allocate_partition(struct mtd_info *parent, const struct mtd_partition *part, int partno, uint64_t cur_offset) { struct mtd_info *master = mtd_get_master(parent); int wr_alignment = (parent->flags & MTD_NO_ERASE) ? master->writesize : master->erasesize; u64 parent_size = mtd_is_partition(parent) ? parent->part.size : parent->size; struct mtd_info *child; u32 remainder; char *name; u64 tmp; /* allocate the partition structure */ child = kzalloc(sizeof(*child), GFP_KERNEL); name = kstrdup(part->name, GFP_KERNEL); if (!name || !child) { printk(KERN_ERR"memory allocation error while creating partitions for \"%s\"\n", parent->name); kfree(name); kfree(child); return ERR_PTR(-ENOMEM); } /* set up the MTD object for this partition */ child->type = parent->type; child->part.flags = parent->flags & ~part->mask_flags; child->part.flags |= part->add_flags; child->flags = child->part.flags; child->part.size = part->size; child->writesize = parent->writesize; child->writebufsize = parent->writebufsize; child->oobsize = parent->oobsize; child->oobavail = parent->oobavail; child->subpage_sft = parent->subpage_sft; child->name = name; child->owner = parent->owner; /* NOTE: Historically, we didn't arrange MTDs as a tree out of * concern for showing the same data in multiple partitions. * However, it is very useful to have the master node present, * so the MTD_PARTITIONED_MASTER option allows that. The master * will have device nodes etc only if this is set, so make the * parent conditional on that option. Note, this is a way to * distinguish between the parent and its partitions in sysfs. */ child->dev.parent = IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER) || mtd_is_partition(parent) ? &parent->dev : parent->dev.parent; child->dev.of_node = part->of_node; child->parent = parent; child->part.offset = part->offset; INIT_LIST_HEAD(&child->partitions); if (child->part.offset == MTDPART_OFS_APPEND) child->part.offset = cur_offset; if (child->part.offset == MTDPART_OFS_NXTBLK) { tmp = cur_offset; child->part.offset = cur_offset; remainder = do_div(tmp, wr_alignment); if (remainder) { child->part.offset += wr_alignment - remainder; printk(KERN_NOTICE "Moving partition %d: " "0x%012llx -> 0x%012llx\n", partno, (unsigned long long)cur_offset, child->part.offset); } } if (child->part.offset == MTDPART_OFS_RETAIN) { child->part.offset = cur_offset; if (parent_size - child->part.offset >= child->part.size) { child->part.size = parent_size - child->part.offset - child->part.size; } else { printk(KERN_ERR "mtd partition \"%s\" doesn't have enough space: %#llx < %#llx, disabled\n", part->name, parent_size - child->part.offset, child->part.size); /* register to preserve ordering */ goto out_register; } } if (child->part.size == MTDPART_SIZ_FULL) child->part.size = parent_size - child->part.offset; printk(KERN_NOTICE "0x%012llx-0x%012llx : \"%s\"\n", child->part.offset, child->part.offset + child->part.size, child->name); /* let's do some sanity checks */ if (child->part.offset >= parent_size) { /* let's register it anyway to preserve ordering */ child->part.offset = 0; child->part.size = 0; /* Initialize ->erasesize to make add_mtd_device() happy. 
*/ child->erasesize = parent->erasesize; printk(KERN_ERR"mtd: partition \"%s\" is out of reach -- disabled\n", part->name); goto out_register; } if (child->part.offset + child->part.size > parent->size) { child->part.size = parent_size - child->part.offset; printk(KERN_WARNING"mtd: partition \"%s\" extends beyond the end of device \"%s\" -- size truncated to %#llx\n", part->name, parent->name, child->part.size); } if (parent->numeraseregions > 1) { /* Deal with variable erase size stuff */ int i, max = parent->numeraseregions; u64 end = child->part.offset + child->part.size; struct mtd_erase_region_info *regions = parent->eraseregions; /* Find the first erase regions which is part of this * partition. */ for (i = 0; i < max && regions[i].offset <= child->part.offset; i++) ; /* The loop searched for the region _behind_ the first one */ if (i > 0) i--; /* Pick biggest erasesize */ for (; i < max && regions[i].offset < end; i++) { if (child->erasesize < regions[i].erasesize) child->erasesize = regions[i].erasesize; } BUG_ON(child->erasesize == 0); } else { /* Single erase size */ child->erasesize = master->erasesize; } /* * Child erasesize might differ from the parent one if the parent * exposes several regions with different erasesize. Adjust * wr_alignment accordingly. */ if (!(child->flags & MTD_NO_ERASE)) wr_alignment = child->erasesize; tmp = mtd_get_master_ofs(child, 0); remainder = do_div(tmp, wr_alignment); if ((child->flags & MTD_WRITEABLE) && remainder) { /* Doesn't start on a boundary of major erase size */ /* FIXME: Let it be writable if it is on a boundary of * _minor_ erase size though */ child->flags &= ~MTD_WRITEABLE; printk(KERN_WARNING"mtd: partition \"%s\" doesn't start on an erase/write block boundary -- force read-only\n", part->name); } tmp = mtd_get_master_ofs(child, 0) + child->part.size; remainder = do_div(tmp, wr_alignment); if ((child->flags & MTD_WRITEABLE) && remainder) { child->flags &= ~MTD_WRITEABLE; printk(KERN_WARNING"mtd: partition \"%s\" doesn't end on an erase/write block -- force read-only\n", part->name); } child->size = child->part.size; child->ecc_step_size = parent->ecc_step_size; child->ecc_strength = parent->ecc_strength; child->bitflip_threshold = parent->bitflip_threshold; if (master->_block_isbad) { uint64_t offs = 0; while (offs < child->part.size) { if (mtd_block_isreserved(child, offs)) child->ecc_stats.bbtblocks++; else if (mtd_block_isbad(child, offs)) child->ecc_stats.badblocks++; offs += child->erasesize; } } out_register: return child; } static ssize_t offset_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%lld\n", mtd->part.offset); } static DEVICE_ATTR_RO(offset); /* mtd partition offset */ static const struct attribute *mtd_partition_attrs[] = { &dev_attr_offset.attr, NULL }; static int mtd_add_partition_attrs(struct mtd_info *new) { int ret = sysfs_create_files(&new->dev.kobj, mtd_partition_attrs); if (ret) printk(KERN_WARNING "mtd: failed to create partition attrs, err=%d\n", ret); return ret; } int mtd_add_partition(struct mtd_info *parent, const char *name, long long offset, long long length) { struct mtd_info *master = mtd_get_master(parent); u64 parent_size = mtd_is_partition(parent) ? 
parent->part.size : parent->size; struct mtd_partition part; struct mtd_info *child; int ret = 0; /* the direct offset is expected */ if (offset == MTDPART_OFS_APPEND || offset == MTDPART_OFS_NXTBLK) return -EINVAL; if (length == MTDPART_SIZ_FULL) length = parent_size - offset; if (length <= 0) return -EINVAL; memset(&part, 0, sizeof(part)); part.name = name; part.size = length; part.offset = offset; child = allocate_partition(parent, &part, -1, offset); if (IS_ERR(child)) return PTR_ERR(child); mutex_lock(&master->master.partitions_lock); list_add_tail(&child->part.node, &parent->partitions); mutex_unlock(&master->master.partitions_lock); ret = add_mtd_device(child); if (ret) goto err_remove_part; mtd_add_partition_attrs(child); return 0; err_remove_part: mutex_lock(&master->master.partitions_lock); list_del(&child->part.node); mutex_unlock(&master->master.partitions_lock); free_partition(child); return ret; } EXPORT_SYMBOL_GPL(mtd_add_partition); /** * __mtd_del_partition - delete MTD partition * * @mtd: MTD structure to be deleted * * This function must be called with the partitions mutex locked. */ static int __mtd_del_partition(struct mtd_info *mtd) { struct mtd_info *child, *next; int err; list_for_each_entry_safe(child, next, &mtd->partitions, part.node) { err = __mtd_del_partition(child); if (err) return err; } sysfs_remove_files(&mtd->dev.kobj, mtd_partition_attrs); list_del_init(&mtd->part.node); err = del_mtd_device(mtd); if (err) return err; return 0; } /* * This function unregisters and destroy all slave MTD objects which are * attached to the given MTD object, recursively. */ static int __del_mtd_partitions(struct mtd_info *mtd) { struct mtd_info *child, *next; int ret, err = 0; list_for_each_entry_safe(child, next, &mtd->partitions, part.node) { if (mtd_has_partitions(child)) __del_mtd_partitions(child); pr_info("Deleting %s MTD partition\n", child->name); list_del_init(&child->part.node); ret = del_mtd_device(child); if (ret < 0) { pr_err("Error when deleting partition \"%s\" (%d)\n", child->name, ret); err = ret; continue; } } return err; } int del_mtd_partitions(struct mtd_info *mtd) { struct mtd_info *master = mtd_get_master(mtd); int ret; pr_info("Deleting MTD partitions on \"%s\":\n", mtd->name); mutex_lock(&master->master.partitions_lock); ret = __del_mtd_partitions(mtd); mutex_unlock(&master->master.partitions_lock); return ret; } int mtd_del_partition(struct mtd_info *mtd, int partno) { struct mtd_info *child, *master = mtd_get_master(mtd); int ret = -EINVAL; mutex_lock(&master->master.partitions_lock); list_for_each_entry(child, &mtd->partitions, part.node) { if (child->index == partno) { ret = __mtd_del_partition(child); break; } } mutex_unlock(&master->master.partitions_lock); return ret; } EXPORT_SYMBOL_GPL(mtd_del_partition); /* * This function, given a parent MTD object and a partition table, creates * and registers the child MTD objects which are bound to the parent according * to the partition definitions. * * For historical reasons, this function's caller only registers the parent * if the MTD_PARTITIONED_MASTER config option is set. 
*/ int add_mtd_partitions(struct mtd_info *parent, const struct mtd_partition *parts, int nbparts) { struct mtd_info *child, *master = mtd_get_master(parent); uint64_t cur_offset = 0; int i, ret; printk(KERN_NOTICE "Creating %d MTD partitions on \"%s\":\n", nbparts, parent->name); for (i = 0; i < nbparts; i++) { child = allocate_partition(parent, parts + i, i, cur_offset); if (IS_ERR(child)) { ret = PTR_ERR(child); goto err_del_partitions; } mutex_lock(&master->master.partitions_lock); list_add_tail(&child->part.node, &parent->partitions); mutex_unlock(&master->master.partitions_lock); ret = add_mtd_device(child); if (ret) { mutex_lock(&master->master.partitions_lock); list_del(&child->part.node); mutex_unlock(&master->master.partitions_lock); free_partition(child); goto err_del_partitions; } mtd_add_partition_attrs(child); /* Look for subpartitions */ ret = parse_mtd_partitions(child, parts[i].types, NULL); if (ret < 0) { pr_err("Failed to parse subpartitions: %d\n", ret); goto err_del_partitions; } cur_offset = child->part.offset + child->part.size; } return 0; err_del_partitions: del_mtd_partitions(master); return ret; } static DEFINE_SPINLOCK(part_parser_lock); static LIST_HEAD(part_parsers); static struct mtd_part_parser *mtd_part_parser_get(const char *name) { struct mtd_part_parser *p, *ret = NULL; spin_lock(&part_parser_lock); list_for_each_entry(p, &part_parsers, list) if (!strcmp(p->name, name) && try_module_get(p->owner)) { ret = p; break; } spin_unlock(&part_parser_lock); return ret; } static inline void mtd_part_parser_put(const struct mtd_part_parser *p) { module_put(p->owner); } /* * Many partition parsers just expected the core to kfree() all their data in * one chunk. Do that by default. */ static void mtd_part_parser_cleanup_default(const struct mtd_partition *pparts, int nr_parts) { kfree(pparts); } int __register_mtd_parser(struct mtd_part_parser *p, struct module *owner) { p->owner = owner; if (!p->cleanup) p->cleanup = &mtd_part_parser_cleanup_default; spin_lock(&part_parser_lock); list_add(&p->list, &part_parsers); spin_unlock(&part_parser_lock); return 0; } EXPORT_SYMBOL_GPL(__register_mtd_parser); void deregister_mtd_parser(struct mtd_part_parser *p) { spin_lock(&part_parser_lock); list_del(&p->list); spin_unlock(&part_parser_lock); } EXPORT_SYMBOL_GPL(deregister_mtd_parser); /* * Do not forget to update 'parse_mtd_partitions()' kerneldoc comment if you * are changing this array! */ static const char * const default_mtd_part_types[] = { "cmdlinepart", "ofpart", NULL }; /* Check DT only when looking for subpartitions. */ static const char * const default_subpartition_types[] = { "ofpart", NULL }; static int mtd_part_do_parse(struct mtd_part_parser *parser, struct mtd_info *master, struct mtd_partitions *pparts, struct mtd_part_parser_data *data) { int ret; ret = (*parser->parse_fn)(master, &pparts->parts, data); pr_debug("%s: parser %s: %i\n", master->name, parser->name, ret); if (ret <= 0) return ret; pr_notice("%d %s partitions found on MTD device %s\n", ret, parser->name, master->name); pparts->nr_parts = ret; pparts->parser = parser; return ret; } /** * mtd_part_get_compatible_parser - find MTD parser by a compatible string * * @compat: compatible string describing partitions in a device tree * * MTD parsers can specify supported partitions by providing a table of * compatibility strings. This function finds a parser that advertises support * for a passed value of "compatible". 
*/ static struct mtd_part_parser *mtd_part_get_compatible_parser(const char *compat) { struct mtd_part_parser *p, *ret = NULL; spin_lock(&part_parser_lock); list_for_each_entry(p, &part_parsers, list) { const struct of_device_id *matches; matches = p->of_match_table; if (!matches) continue; for (; matches->compatible[0]; matches++) { if (!strcmp(matches->compatible, compat) && try_module_get(p->owner)) { ret = p; break; } } if (ret) break; } spin_unlock(&part_parser_lock); return ret; } static int mtd_part_of_parse(struct mtd_info *master, struct mtd_partitions *pparts) { struct mtd_part_parser *parser; struct device_node *np; struct device_node *child; struct property *prop; struct device *dev; const char *compat; const char *fixed = "fixed-partitions"; int ret, err = 0; dev = &master->dev; /* Use parent device (controller) if the top level MTD is not registered */ if (!IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER) && !mtd_is_partition(master)) dev = master->dev.parent; np = mtd_get_of_node(master); if (mtd_is_partition(master)) of_node_get(np); else np = of_get_child_by_name(np, "partitions"); /* * Don't create devices that are added to a bus but will never get * probed. That'll cause fw_devlink to block probing of consumers of * this partition until the partition device is probed. */ for_each_child_of_node(np, child) if (of_device_is_compatible(child, "nvmem-cells")) of_node_set_flag(child, OF_POPULATED); of_property_for_each_string(np, "compatible", prop, compat) { parser = mtd_part_get_compatible_parser(compat); if (!parser) continue; ret = mtd_part_do_parse(parser, master, pparts, NULL); if (ret > 0) { of_platform_populate(np, NULL, NULL, dev); of_node_put(np); return ret; } mtd_part_parser_put(parser); if (ret < 0 && !err) err = ret; } of_platform_populate(np, NULL, NULL, dev); of_node_put(np); /* * For backward compatibility we have to try the "fixed-partitions" * parser. It supports old DT format with partitions specified as a * direct subnodes of a flash device DT node without any compatibility * specified we could match. */ parser = mtd_part_parser_get(fixed); if (!parser && !request_module("%s", fixed)) parser = mtd_part_parser_get(fixed); if (parser) { ret = mtd_part_do_parse(parser, master, pparts, NULL); if (ret > 0) return ret; mtd_part_parser_put(parser); if (ret < 0 && !err) err = ret; } return err; } /** * parse_mtd_partitions - parse and register MTD partitions * * @master: the master partition (describes whole MTD device) * @types: names of partition parsers to try or %NULL * @data: MTD partition parser-specific data * * This function tries to find & register partitions on MTD device @master. It * uses MTD partition parsers, specified in @types. However, if @types is %NULL, * then the default list of parsers is used. The default list contains only the * "cmdlinepart" and "ofpart" parsers ATM. * Note: If there are more then one parser in @types, the kernel only takes the * partitions parsed out by the first parser. * * This function may return: * o a negative error code in case of failure * o number of found partitions otherwise */ int parse_mtd_partitions(struct mtd_info *master, const char *const *types, struct mtd_part_parser_data *data) { struct mtd_partitions pparts = { }; struct mtd_part_parser *parser; int ret, err = 0; if (!types) types = mtd_is_partition(master) ? default_subpartition_types : default_mtd_part_types; for ( ; *types; types++) { /* * ofpart is a special type that means OF partitioning info * should be used. 
		 * It requires slightly different logic, so it is
		 * handled in a separate function.
		 */
		if (!strcmp(*types, "ofpart")) {
			ret = mtd_part_of_parse(master, &pparts);
		} else {
			pr_debug("%s: parsing partitions %s\n", master->name,
				 *types);
			parser = mtd_part_parser_get(*types);
			if (!parser && !request_module("%s", *types))
				parser = mtd_part_parser_get(*types);
			if (!parser)
				continue;
			pr_debug("%s: got parser %s\n", master->name,
				 parser->name);
			ret = mtd_part_do_parse(parser, master, &pparts, data);
			if (ret <= 0)
				mtd_part_parser_put(parser);
		}

		/* Found partitions! */
		if (ret > 0) {
			err = add_mtd_partitions(master, pparts.parts,
						 pparts.nr_parts);
			mtd_part_parser_cleanup(&pparts);
			return err ? err : pparts.nr_parts;
		}
		/*
		 * Stash the first error we see; only report it if no parser
		 * succeeds
		 */
		if (ret < 0 && !err)
			err = ret;
	}
	return err;
}

void mtd_part_parser_cleanup(struct mtd_partitions *parts)
{
	const struct mtd_part_parser *parser;

	if (!parts)
		return;

	parser = parts->parser;
	if (parser) {
		if (parser->cleanup)
			parser->cleanup(parts->parts, parts->nr_parts);

		mtd_part_parser_put(parser);
	}
}

/* Returns the size of the entire flash chip */
uint64_t mtd_get_device_size(const struct mtd_info *mtd)
{
	struct mtd_info *master = mtd_get_master((struct mtd_info *)mtd);

	return master->size;
}
EXPORT_SYMBOL_GPL(mtd_get_device_size);
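The partitioning helpers above are exported for drivers that want to carve partitions at runtime. A minimal sketch follows, assuming the caller already holds a registered parent struct mtd_info; the function name, offset, size, and partition number are illustrative, and the number passed to mtd_del_partition() would normally be looked up from the child device's index.

#include <linux/mtd/mtd.h>
#include <linux/mtd/partitions.h>

/* Hypothetical helper: add a 1 MiB partition at the start of @parent,
 * use it, then remove it again via its MTD device number @partno. */
static int example_split_parent(struct mtd_info *parent, int partno)
{
	int err;

	err = mtd_add_partition(parent, "example-part", 0, 1024 * 1024);
	if (err)
		return err;

	/* ... the child shows up as a normal MTD device here ... */

	return mtd_del_partition(parent, partno);
}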
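A partition parser plugs into parse_mtd_partitions() through the parser list managed above. The sketch below registers a trivial parser under the hypothetical name "examplepart". The struct mtd_part_parser fields used here (name, parse_fn, optional cleanup) are the ones the code above touches; parse_fn returns the number of partitions it filled in, matching how mtd_part_do_parse() consumes its return value, and a NULL cleanup falls back to the default kfree() of the array.

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mtd/mtd.h>
#include <linux/mtd/partitions.h>

/* Report the whole device as a single partition (illustrative only). */
static int example_parse_fn(struct mtd_info *master,
			    const struct mtd_partition **pparts,
			    struct mtd_part_parser_data *data)
{
	struct mtd_partition *parts;

	parts = kcalloc(1, sizeof(*parts), GFP_KERNEL);
	if (!parts)
		return -ENOMEM;

	parts[0].name = "whole-device";
	parts[0].offset = 0;
	parts[0].size = master->size;

	*pparts = parts;
	return 1;	/* number of partitions found */
}

static struct mtd_part_parser example_parser = {
	.name = "examplepart",
	.parse_fn = example_parse_fn,
	/* .cleanup left NULL: the core kfree()s the returned array */
};

static int __init example_parser_init(void)
{
	return __register_mtd_parser(&example_parser, THIS_MODULE);
}
module_init(example_parser_init);

static void __exit example_parser_exit(void)
{
	deregister_mtd_parser(&example_parser);
}
module_exit(example_parser_exit);

MODULE_LICENSE("GPL");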
// SPDX-License-Identifier: GPL-2.0
/*
 *	SUCS NET3:
 *
 *	Generic datagram handling routines. These are generic for all
 *	protocols. Possibly a generic IP version on top of these would
 *	make sense. Not tonight however 8-).
 *	This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
 *	NetROM layer all have identical poll code and mostly
 *	identical recvmsg() code. So we share it here. The poll was
 *	shared before but buried in udp.c so I moved it.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>.
 *			(datagram_poll() from old udp.c code)
 *
 *	Fixes:
 *		Alan Cox	:	NULL return from skb_peek_copy()
 *					understood
 *		Alan Cox	:	Rewrote skb_read_datagram to avoid the
 *					skb_peek_copy stuff.
 *		Alan Cox	:	Added support for SOCK_SEQPACKET.
 *					IPX can no longer use the SO_TYPE hack
 *					but AX.25 now works right, and SPX is
 *					feasible.
 *		Alan Cox	:	Fixed write poll of non IP protocol
 *					crash.
 *		Florian La Roche:	Changed for my new skbuff handling.
 *		Darryl Miles	:	Fixed non-blocking SOCK_SEQPACKET.
 *		Linus Torvalds	:	BSD semantic fixes.
 *		Alan Cox	:	Datagram iovec handling
 *		Darryl Miles	:	Fixed non-blocking SOCK_STREAM.
 *		Alan Cox	:	POSIXisms
 *		Pete Wyckoff	:	Unconnected accept() fix.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/iov_iter.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/crc32.h>

#include <net/protocol.h>
#include <linux/skbuff.h>

#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <trace/events/skb.h>
#include <net/busy_poll.h>

#include "devmem.h"

/*
 *	Is a socket 'connection oriented' ?
 */
static inline int connection_based(struct sock *sk)
{
	return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}

static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode,
				  int sync, void *key)
{
	/*
	 * Avoid a wakeup if event not interesting for us
	 */
	if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR)))
		return 0;
	return autoremove_wake_function(wait, mode, sync, key);
}

/*
 * Wait for the last received packet to be different from skb
 */
int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
				int *err, long *timeo_p,
				const struct sk_buff *skb)
{
	int error;
	DEFINE_WAIT_FUNC(wait, receiver_wake_function);

	prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

	/* Socket errors?
*/ error = sock_error(sk); if (error) goto out_err; if (READ_ONCE(queue->prev) != skb) goto out; /* Socket shut down? */ if (sk->sk_shutdown & RCV_SHUTDOWN) goto out_noerr; /* Sequenced packets can come disconnected. * If so we report the problem */ error = -ENOTCONN; if (connection_based(sk) && !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN)) goto out_err; /* handle signals */ if (signal_pending(current)) goto interrupted; error = 0; *timeo_p = schedule_timeout(*timeo_p); out: finish_wait(sk_sleep(sk), &wait); return error; interrupted: error = sock_intr_errno(*timeo_p); out_err: *err = error; goto out; out_noerr: *err = 0; error = 1; goto out; } EXPORT_SYMBOL(__skb_wait_for_more_packets); static struct sk_buff *skb_set_peeked(struct sk_buff *skb) { struct sk_buff *nskb; if (skb->peeked) return skb; /* We have to unshare an skb before modifying it. */ if (!skb_shared(skb)) goto done; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return ERR_PTR(-ENOMEM); skb->prev->next = nskb; skb->next->prev = nskb; nskb->prev = skb->prev; nskb->next = skb->next; consume_skb(skb); skb = nskb; done: skb->peeked = 1; return skb; } struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last) { bool peek_at_off = false; struct sk_buff *skb; int _off = 0; if (unlikely(flags & MSG_PEEK && *off >= 0)) { peek_at_off = true; _off = *off; } *last = queue->prev; skb_queue_walk(queue, skb) { if (flags & MSG_PEEK) { if (peek_at_off && _off >= skb->len && (_off || skb->peeked)) { _off -= skb->len; continue; } if (!skb->len) { skb = skb_set_peeked(skb); if (IS_ERR(skb)) { *err = PTR_ERR(skb); return NULL; } } refcount_inc(&skb->users); } else { __skb_unlink(skb, queue); } *off = _off; return skb; } return NULL; } /** * __skb_try_recv_datagram - Receive a datagram skbuff * @sk: socket * @queue: socket queue from which to receive * @flags: MSG\_ flags * @off: an offset in bytes to peek skb from. Returns an offset * within an skb where data actually starts * @err: error code returned * @last: set to last peeked message to inform the wait function * what to look for when peeking * * Get a datagram skbuff, understands the peeking, nonblocking wakeups * and possible races. This replaces identical code in packet, raw and * udp, as well as the IPX AX.25 and Appletalk. It also finally fixes * the long standing peek and read race for datagram sockets. If you * alter this routine remember it must be re-entrant. * * This function will lock the socket if a skb is returned, so * the caller needs to unlock the socket in that case (usually by * calling skb_free_datagram). Returns NULL with @err set to * -EAGAIN if no data was available or to some other value if an * error was detected. * * * It does not lock socket since today. This function is * * free of race conditions. This measure should/can improve * * significantly datagram socket latencies at high loads, * * when data copying to user space takes lots of time. * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet * * 8) Great win.) * * --ANK (980729) * * The order of the tests when we find no data waiting are specified * quite explicitly by POSIX 1003.1g, don't change them without having * the standard around please. 
*/ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last) { struct sk_buff *skb; unsigned long cpu_flags; /* * Caller is allowed not to check sk->sk_err before skb_recv_datagram() */ int error = sock_error(sk); if (error) goto no_packet; do { /* Again only user level code calls this function, so nothing * interrupt level will suddenly eat the receive_queue. * * Look at current nfs client by the way... * However, this function was correct in any case. 8) */ spin_lock_irqsave(&queue->lock, cpu_flags); skb = __skb_try_recv_from_queue(queue, flags, off, &error, last); spin_unlock_irqrestore(&queue->lock, cpu_flags); if (error) goto no_packet; if (skb) return skb; if (!sk_can_busy_loop(sk)) break; sk_busy_loop(sk, flags & MSG_DONTWAIT); } while (READ_ONCE(queue->prev) != *last); error = -EAGAIN; no_packet: *err = error; return NULL; } EXPORT_SYMBOL(__skb_try_recv_datagram); struct sk_buff *__skb_recv_datagram(struct sock *sk, struct sk_buff_head *sk_queue, unsigned int flags, int *off, int *err) { struct sk_buff *skb, *last; long timeo; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err, &last); if (skb) return skb; if (*err != -EAGAIN) break; } while (timeo && !__skb_wait_for_more_packets(sk, sk_queue, err, &timeo, last)); return NULL; } EXPORT_SYMBOL(__skb_recv_datagram); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int *err) { int off = 0; return __skb_recv_datagram(sk, &sk->sk_receive_queue, flags, &off, err); } EXPORT_SYMBOL(skb_recv_datagram); void skb_free_datagram(struct sock *sk, struct sk_buff *skb) { consume_skb(skb); } EXPORT_SYMBOL(skb_free_datagram); int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb)) { int err = 0; if (flags & MSG_PEEK) { err = -ENOENT; spin_lock_bh(&sk_queue->lock); if (skb->next) { __skb_unlink(skb, sk_queue); refcount_dec(&skb->users); if (destructor) destructor(sk, skb); err = 0; } spin_unlock_bh(&sk_queue->lock); } sk_drops_inc(sk); return err; } EXPORT_SYMBOL(__sk_queue_drop_skb); /** * skb_kill_datagram - Free a datagram skbuff forcibly * @sk: socket * @skb: datagram skbuff * @flags: MSG\_ flags * * This function frees a datagram skbuff that was received by * skb_recv_datagram. The flags argument must match the one * used for skb_recv_datagram. * * If the MSG_PEEK flag is set, and the packet is still on the * receive queue of the socket, it will be taken off the queue * before it is freed. * * This function currently only disables BH when acquiring the * sk_receive_queue lock. Therefore it must not be used in a * context where that lock is acquired in an IRQ context. * * It returns 0 if the packet was removed by us. 
*/ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) { int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags, NULL); kfree_skb(skb); return err; } EXPORT_SYMBOL(skb_kill_datagram); INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr, size_t bytes, void *data __always_unused, struct iov_iter *i)); static int __skb_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, bool fault_short, size_t (*cb)(const void *, size_t, void *, struct iov_iter *), void *data) { int start = skb_headlen(skb); int i, copy = start - offset, start_off = offset, n; struct sk_buff *frag_iter; /* Copy header. */ if (copy > 0) { if (copy > len) copy = len; n = INDIRECT_CALL_1(cb, simple_copy_to_iter, skb->data + offset, copy, data, to); offset += n; if (n != copy) goto short_copy; if ((len -= copy) == 0) return 0; } if (!skb_frags_readable(skb)) goto short_copy; /* Copy paged appendix. Hmm... why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { u32 p_off, p_len, copied; struct page *p; u8 *vaddr; if (copy > len) copy = len; n = 0; skb_frag_foreach_page(frag, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_local_page(p); n += INDIRECT_CALL_1(cb, simple_copy_to_iter, vaddr + p_off, p_len, data, to); kunmap_local(vaddr); } offset += n; if (n != copy) goto short_copy; if (!(len -= copy)) return 0; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; if (__skb_datagram_iter(frag_iter, offset - start, to, copy, fault_short, cb, data)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; } start = end; } if (!len) return 0; /* This is not really a user copy fault, but rather someone * gave us a bogus length on the skb. We should probably * print a warning here as it may indicate a kernel bug. */ fault: iov_iter_revert(to, offset - start_off); return -EFAULT; short_copy: if (fault_short || iov_iter_count(to)) goto fault; return 0; } #ifdef CONFIG_NET_CRC32C static size_t crc32c_and_copy_to_iter(const void *addr, size_t bytes, void *_crcp, struct iov_iter *i) { u32 *crcp = _crcp; size_t copied; copied = copy_to_iter(addr, bytes, i); *crcp = crc32c(*crcp, addr, copied); return copied; } /** * skb_copy_and_crc32c_datagram_iter - Copy datagram to an iovec iterator * and update a CRC32C value. * @skb: buffer to copy * @offset: offset in the buffer to start copying from * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec * @crcp: pointer to CRC32C value to update * * Return: 0 on success, -EFAULT if there was a fault during copy. */ int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, u32 *crcp) { return __skb_datagram_iter(skb, offset, to, len, true, crc32c_and_copy_to_iter, crcp); } EXPORT_SYMBOL(skb_copy_and_crc32c_datagram_iter); #endif /* CONFIG_NET_CRC32C */ static size_t simple_copy_to_iter(const void *addr, size_t bytes, void *data __always_unused, struct iov_iter *i) { return copy_to_iter(addr, bytes, i); } /** * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. 
* @skb: buffer to copy * @offset: offset in the buffer to start copying from * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec */ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len) { trace_skb_copy_datagram_iovec(skb, len); return __skb_datagram_iter(skb, offset, to, len, false, simple_copy_to_iter, NULL); } EXPORT_SYMBOL(skb_copy_datagram_iter); /** * skb_copy_datagram_from_iter - Copy a datagram from an iov_iter. * @skb: buffer to copy * @offset: offset in the buffer to start copying to * @from: the copy source * @len: amount of data to copy to buffer from iovec * * Returns 0 or -EFAULT. */ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; /* Copy header. */ if (copy > 0) { if (copy > len) copy = len; if (copy_from_iter(skb->data + offset, copy, from) != copy) goto fault; if ((len -= copy) == 0) return 0; offset += copy; } /* Copy paged appendix. Hmm... why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { size_t copied; if (copy > len) copy = len; copied = copy_page_from_iter(skb_frag_page(frag), skb_frag_off(frag) + offset - start, copy, from); if (copied != copy) goto fault; if (!(len -= copy)) return 0; offset += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; if (skb_copy_datagram_from_iter(frag_iter, offset - start, from, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; } start = end; } if (!len) return 0; fault: return -EFAULT; } EXPORT_SYMBOL(skb_copy_datagram_from_iter); int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset, struct iov_iter *from, int len) { struct iov_iter_state state; int ret; iov_iter_save_state(from, &state); ret = skb_copy_datagram_from_iter(skb, offset, from, len); if (ret) iov_iter_restore(from, &state); return ret; } EXPORT_SYMBOL(skb_copy_datagram_from_iter_full); int zerocopy_fill_skb_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length) { int frag = skb_shinfo(skb)->nr_frags; if (!skb_frags_readable(skb)) return -EFAULT; while (length && iov_iter_count(from)) { struct page *head, *last_head = NULL; struct page *pages[MAX_SKB_FRAGS]; int refs, order, n = 0; size_t start; ssize_t copied; if (frag == MAX_SKB_FRAGS) return -EMSGSIZE; copied = iov_iter_get_pages2(from, pages, length, MAX_SKB_FRAGS - frag, &start); if (copied < 0) return -EFAULT; length -= copied; skb->data_len += copied; skb->len += copied; skb->truesize += PAGE_ALIGN(copied + start); head = compound_head(pages[n]); order = compound_order(head); for (refs = 0; copied != 0; start = 0) { int size = min_t(int, copied, PAGE_SIZE - start); if (pages[n] - head > (1UL << order) - 1) { head = compound_head(pages[n]); order = compound_order(head); } start += (pages[n] - head) << PAGE_SHIFT; copied -= size; n++; if (frag) { skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1]; if (head == skb_frag_page(last) && start == skb_frag_off(last) + skb_frag_size(last)) { skb_frag_size_add(last, size); /* We combined this page, we need to release * a reference. 
Since compound pages refcount * is shared among many pages, batch the refcount * adjustments to limit false sharing. */ last_head = head; refs++; continue; } } if (refs) { page_ref_sub(last_head, refs); refs = 0; } skb_fill_page_desc_noacc(skb, frag++, head, start, size); } if (refs) page_ref_sub(last_head, refs); } return 0; } static int zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from, int length, struct net_devmem_dmabuf_binding *binding) { int i = skb_shinfo(skb)->nr_frags; size_t virt_addr, size, off; struct net_iov *niov; /* Devmem filling works by taking an IOVEC from the user where the * iov_addrs are interpreted as an offset in bytes into the dma-buf to * send from. We do not support other iter types. */ if (iov_iter_type(from) != ITER_IOVEC && iov_iter_type(from) != ITER_UBUF) return -EFAULT; while (length && iov_iter_count(from)) { if (i == MAX_SKB_FRAGS) return -EMSGSIZE; virt_addr = (size_t)iter_iov_addr(from); niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size); if (!niov) return -EFAULT; size = min_t(size_t, size, length); size = min_t(size_t, size, iter_iov_len(from)); get_netmem(net_iov_to_netmem(niov)); skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off, size, PAGE_SIZE); iov_iter_advance(from, size); length -= size; i++; } return 0; } int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length, struct net_devmem_dmabuf_binding *binding) { unsigned long orig_size = skb->truesize; unsigned long truesize; int ret; if (msg && msg->msg_ubuf && msg->sg_from_iter) ret = msg->sg_from_iter(skb, from, length); else if (binding) ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding); else ret = zerocopy_fill_skb_from_iter(skb, from, length); truesize = skb->truesize - orig_size; if (sk && sk->sk_type == SOCK_STREAM) { sk_wmem_queued_add(sk, truesize); if (!skb_zcopy_pure(skb)) sk_mem_charge(sk, truesize); } else { refcount_add(truesize, &skb->sk->sk_wmem_alloc); } return ret; } EXPORT_SYMBOL(__zerocopy_sg_from_iter); /** * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter * @skb: buffer to copy * @from: the source to copy from * * The function will first copy up to headlen, and then pin the userspace * pages and build frags through them. * * Returns 0, -EFAULT or -EMSGSIZE. */ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) { int copy = min_t(int, skb_headlen(skb), iov_iter_count(from)); /* copy up to skb headlen */ if (skb_copy_datagram_from_iter(skb, 0, from, copy)) return -EFAULT; return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL); } EXPORT_SYMBOL(zerocopy_sg_from_iter); static __always_inline size_t copy_to_user_iter_csum(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { __wsum next, *csum = priv2; next = csum_and_copy_to_user(from + progress, iter_to, len); *csum = csum_block_add(*csum, next, progress); return next ? 
0 : len; } static __always_inline size_t memcpy_to_iter_csum(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { __wsum *csum = priv2; __wsum next = csum_partial_copy_nocheck(from + progress, iter_to, len); *csum = csum_block_add(*csum, next, progress); return 0; } struct csum_state { __wsum csum; size_t off; }; static size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate, struct iov_iter *i) { struct csum_state *csstate = _csstate; __wsum sum; if (WARN_ON_ONCE(i->data_source)) return 0; if (unlikely(iov_iter_is_discard(i))) { // can't use csum_memcpy() for that one - data is not copied csstate->csum = csum_block_add(csstate->csum, csum_partial(addr, bytes, 0), csstate->off); csstate->off += bytes; return bytes; } sum = csum_shift(csstate->csum, csstate->off); bytes = iterate_and_advance2(i, bytes, (void *)addr, &sum, copy_to_user_iter_csum, memcpy_to_iter_csum); csstate->csum = csum_shift(sum, csstate->off); csstate->off += bytes; return bytes; } /** * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator * and update a checksum. * @skb: buffer to copy * @offset: offset in the buffer to start copying from * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec * @csump: checksum pointer */ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, __wsum *csump) { struct csum_state csdata = { .csum = *csump }; int ret; ret = __skb_datagram_iter(skb, offset, to, len, true, csum_and_copy_to_iter, &csdata); if (ret) return ret; *csump = csdata.csum; return 0; } /** * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec. * @skb: skbuff * @hlen: hardware length * @msg: destination * * Caller _must_ check that skb will fit to this iovec. * * Returns: 0 - success. * -EINVAL - checksum failure. * -EFAULT - fault during copy. */ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, struct msghdr *msg) { __wsum csum; int chunk = skb->len - hlen; if (!chunk) return 0; if (msg_data_left(msg) < chunk) { if (__skb_checksum_complete(skb)) return -EINVAL; if (skb_copy_datagram_msg(skb, hlen, msg, chunk)) goto fault; } else { csum = csum_partial(skb->data, hlen, skb->csum); if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter, chunk, &csum)) goto fault; if (csum_fold(csum)) { iov_iter_revert(&msg->msg_iter, chunk); return -EINVAL; } if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) netdev_rx_csum_fault(NULL, skb); } return 0; fault: return -EFAULT; } EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); /** * datagram_poll_queue - same as datagram_poll, but on a specific receive * queue * @file: file struct * @sock: socket * @wait: poll table * @rcv_queue: receive queue to poll * * Performs polling on the given receive queue, handling shutdown, error, * and connection state. This is useful for protocols that deliver * userspace-bound packets through a custom queue instead of * sk->sk_receive_queue. * * Return: poll bitmask indicating the socket's current state */ __poll_t datagram_poll_queue(struct file *file, struct socket *sock, poll_table *wait, struct sk_buff_head *rcv_queue) { struct sock *sk = sock->sk; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; /* exceptional events? */ if (READ_ONCE(sk->sk_err) || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? 
			EPOLLPRI : 0);

	shutdown = READ_ONCE(sk->sk_shutdown);
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(rcv_queue))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based sockets need to check for termination and startup */
	if (connection_based(sk)) {
		int state = READ_ONCE(sk->sk_state);

		if (state == TCP_CLOSE)
			mask |= EPOLLHUP;
		/* connection hasn't started yet? */
		if (state == TCP_SYN_SENT)
			return mask;
	}

	/* writable? */
	if (sock_writeable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
EXPORT_SYMBOL(datagram_poll_queue);

/**
 * datagram_poll - generic datagram poll
 * @file: file struct
 * @sock: socket
 * @wait: poll table
 *
 * Datagram poll: Again totally generic. This also handles
 * sequenced packet sockets providing the socket receive queue
 * is only ever holding data ready to receive.
 *
 * Note: when you *don't* use this routine for this protocol,
 * and you use a different write policy from sock_writeable()
 * then please supply your own write_space callback.
 *
 * Return: poll bitmask indicating the socket's current state
 */
__poll_t datagram_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	return datagram_poll_queue(file, sock, wait,
				   &sock->sk->sk_receive_queue);
}
EXPORT_SYMBOL(datagram_poll);
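A typical datagram protocol builds its recvmsg() on the helpers above. The sketch below is a hypothetical, simplified receive path: it pulls one skb with skb_recv_datagram(), copies up to the requested length with skb_copy_datagram_msg() (the inline wrapper around skb_copy_datagram_iter()), flags truncation, and releases the skb with skb_free_datagram(). Error handling is trimmed for brevity.

#include <linux/net.h>
#include <linux/skbuff.h>
#include <net/sock.h>

static int example_recvmsg(struct socket *sock, struct msghdr *msg,
			   size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	size_t copied;
	int err;

	/* Blocks unless MSG_DONTWAIT is set; err explains a NULL return. */
	skb = skb_recv_datagram(sk, flags, &err);
	if (!skb)
		return err;

	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_msg(skb, 0, msg, copied);

	skb_free_datagram(sk, skb);
	return err ? err : copied;
}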
3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 
4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 
// SPDX-License-Identifier: GPL-2.0-only
/******************************************************************************
 * emulate.c
 *
 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
 *
 * Copyright (c) 2005 Keir Fraser
 *
 * Linux coding style, mod r/m decoder, segment base fixes, real-mode
 * privileged instructions:
 *
 * Copyright (C) 2006 Qumranet
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 *   Avi Kivity <avi@qumranet.com>
 *   Yaniv Kamay <yaniv@qumranet.com>
 *
 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
#include <linux/stringify.h>
#include <asm/debugreg.h>
#include <asm/nospec-branch.h>
#include <asm/ibt.h>
#include <asm/text-patching.h>

#include "x86.h"
#include "tss.h"
#include "mmu.h"
#include "pmu.h"

/*
 * Operand types
 */
#define OpNone             0ull
#define OpImplicit         1ull  /* No generic decode */
#define OpReg              2ull  /* Register */
#define OpMem              3ull  /* Memory */
#define OpAcc              4ull  /* Accumulator: AL/AX/EAX/RAX */
#define OpDI               5ull  /* ES:DI/EDI/RDI */
#define OpMem64            6ull  /* Memory, 64-bit */
#define OpImmUByte         7ull  /* Zero-extended 8-bit immediate */
#define OpDX               8ull  /* DX register */
#define OpCL               9ull  /* CL register (for shifts) */
#define OpImmByte         10ull  /* 8-bit sign extended immediate */
#define OpOne             11ull  /* Implied 1 */
#define OpImm             12ull  /* Sign extended up to 32-bit immediate */
#define OpMem16           13ull  /* Memory operand (16-bit). */
#define OpMem32           14ull  /* Memory operand (32-bit).
*/ #define OpImmU 15ull /* Immediate operand, zero extended */ #define OpSI 16ull /* SI/ESI/RSI */ #define OpImmFAddr 17ull /* Immediate far address */ #define OpMemFAddr 18ull /* Far address in memory */ #define OpImmU16 19ull /* Immediate operand, 16 bits, zero extended */ #define OpES 20ull /* ES */ #define OpCS 21ull /* CS */ #define OpSS 22ull /* SS */ #define OpDS 23ull /* DS */ #define OpFS 24ull /* FS */ #define OpGS 25ull /* GS */ #define OpMem8 26ull /* 8-bit zero extended memory operand */ #define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */ #define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */ #define OpAccLo 29ull /* Low part of extended acc (AX/AX/EAX/RAX) */ #define OpAccHi 30ull /* High part of extended acc (-/DX/EDX/RDX) */ #define OpBits 5 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) /* * Opcode effective-address decode tables. * Note that we only emulate instructions that have at least one memory * operand (excluding implicit stack references). We assume that stack * references and instruction fetches will never occur in special memory * areas that require emulation. So, for example, 'mov <imm>,<reg>' need * not be handled. */ /* Operand sizes: 8-bit operands or specified/overridden size. */ #define ByteOp (1<<0) /* 8-bit operands. */ /* Destination operand type. */ #define DstShift 1 #define ImplicitOps (OpImplicit << DstShift) #define DstReg (OpReg << DstShift) #define DstMem (OpMem << DstShift) #define DstAcc (OpAcc << DstShift) #define DstDI (OpDI << DstShift) #define DstMem64 (OpMem64 << DstShift) #define DstMem16 (OpMem16 << DstShift) #define DstImmUByte (OpImmUByte << DstShift) #define DstDX (OpDX << DstShift) #define DstAccLo (OpAccLo << DstShift) #define DstMask (OpMask << DstShift) /* Source operand type. */ #define SrcShift 6 #define SrcNone (OpNone << SrcShift) #define SrcReg (OpReg << SrcShift) #define SrcMem (OpMem << SrcShift) #define SrcMem16 (OpMem16 << SrcShift) #define SrcMem32 (OpMem32 << SrcShift) #define SrcImm (OpImm << SrcShift) #define SrcImmByte (OpImmByte << SrcShift) #define SrcOne (OpOne << SrcShift) #define SrcImmUByte (OpImmUByte << SrcShift) #define SrcImmU (OpImmU << SrcShift) #define SrcSI (OpSI << SrcShift) #define SrcXLat (OpXLat << SrcShift) #define SrcImmFAddr (OpImmFAddr << SrcShift) #define SrcMemFAddr (OpMemFAddr << SrcShift) #define SrcAcc (OpAcc << SrcShift) #define SrcImmU16 (OpImmU16 << SrcShift) #define SrcImm64 (OpImm64 << SrcShift) #define SrcDX (OpDX << SrcShift) #define SrcMem8 (OpMem8 << SrcShift) #define SrcAccHi (OpAccHi << SrcShift) #define SrcMask (OpMask << SrcShift) #define BitOp (1<<11) #define MemAbs (1<<12) /* Memory operand is absolute displacement */ #define String (1<<13) /* String instruction (rep capable) */ #define Stack (1<<14) /* Stack instruction (push/pop) */ #define GroupMask (7<<15) /* Opcode uses one of the group mechanisms */ #define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (2<<15) /* Alternate decoding of mod == 3 */ #define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ #define Escape (5<<15) /* Escape to coprocessor instruction */ #define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */ #define ModeDual (7<<15) /* Different instruction for 32/64 bit */ #define Sse (1<<18) /* SSE Vector instruction */ /* Generic ModRM decode. */ #define ModRM (1<<19) /* Destination is only written; never read. 
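 * (i.e. plain MOV-style stores: when this flag is set the emulator never
 * fetches the old destination value, it only performs the write-back.)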
*/ #define Mov (1<<20) /* Misc flags */ #define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ #define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ #define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ #define Undefined (1<<25) /* No Such Instruction */ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) #define PageTable (1 << 29) /* instruction used to write page table */ #define NotImpl (1 << 30) /* instruction is not implemented */ /* Source 2 operand type */ #define Src2Shift (31) #define Src2None (OpNone << Src2Shift) #define Src2Mem (OpMem << Src2Shift) #define Src2CL (OpCL << Src2Shift) #define Src2ImmByte (OpImmByte << Src2Shift) #define Src2One (OpOne << Src2Shift) #define Src2Imm (OpImm << Src2Shift) #define Src2ES (OpES << Src2Shift) #define Src2CS (OpCS << Src2Shift) #define Src2SS (OpSS << Src2Shift) #define Src2DS (OpDS << Src2Shift) #define Src2FS (OpFS << Src2Shift) #define Src2GS (OpGS << Src2Shift) #define Src2Mask (OpMask << Src2Shift) #define Mmx ((u64)1 << 40) /* MMX Vector instruction */ #define AlignMask ((u64)7 << 41) #define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ #define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */ #define Avx ((u64)3 << 41) /* Advanced Vector Extensions */ #define Aligned16 ((u64)4 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ #define NoMod ((u64)1 << 47) /* Mod field is ignored */ #define Intercept ((u64)1 << 48) /* Has valid intercept field */ #define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ #define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ #define NearBranch ((u64)1 << 52) /* Near branches */ #define No16 ((u64)1 << 53) /* No 16 bit operand */ #define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ #define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */ #define IsBranch ((u64)1 << 56) /* Instruction is considered a branch. */ #define ShadowStack ((u64)1 << 57) /* Instruction affects Shadow Stacks. */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) #define X2(x...) x, x #define X3(x...) X2(x), x #define X4(x...) X2(x), X2(x) #define X5(x...) X4(x), x #define X6(x...) X4(x), X2(x) #define X7(x...) X4(x), X3(x) #define X8(x...) X4(x), X4(x) #define X16(x...) 
X8(x), X8(x) struct opcode { u64 flags; u8 intercept; u8 pad[7]; union { int (*execute)(struct x86_emulate_ctxt *ctxt); const struct opcode *group; const struct group_dual *gdual; const struct gprefix *gprefix; const struct escape *esc; const struct instr_dual *idual; const struct mode_dual *mdual; } u; int (*check_perm)(struct x86_emulate_ctxt *ctxt); }; struct group_dual { struct opcode mod012[8]; struct opcode mod3[8]; }; struct gprefix { struct opcode pfx_no; struct opcode pfx_66; struct opcode pfx_f2; struct opcode pfx_f3; }; struct escape { struct opcode op[8]; struct opcode high[64]; }; struct instr_dual { struct opcode mod012; struct opcode mod3; }; struct mode_dual { struct opcode mode32; struct opcode mode64; }; #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a enum x86_transfer_type { X86_TRANSFER_NONE, X86_TRANSFER_CALL_JMP, X86_TRANSFER_RET, X86_TRANSFER_TASK_SWITCH, }; static void writeback_registers(struct x86_emulate_ctxt *ctxt) { unsigned long dirty = ctxt->regs_dirty; unsigned reg; for_each_set_bit(reg, &dirty, NR_EMULATOR_GPRS) ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]); } static void invalidate_registers(struct x86_emulate_ctxt *ctxt) { ctxt->regs_dirty = 0; ctxt->regs_valid = 0; } /* * These EFLAGS bits are restored from saved value during emulation, and * any changes are written back to the saved value after emulation. */ #define EFLAGS_MASK (X86_EFLAGS_OF|X86_EFLAGS_SF|X86_EFLAGS_ZF|X86_EFLAGS_AF|\ X86_EFLAGS_PF|X86_EFLAGS_CF) #ifdef CONFIG_X86_64 #define ON64(x...) x #else #define ON64(x...) #endif #define EM_ASM_START(op) \ static int em_##op(struct x86_emulate_ctxt *ctxt) \ { \ unsigned long flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; \ int bytes = 1, ok = 1; \ if (!(ctxt->d & ByteOp)) \ bytes = ctxt->dst.bytes; \ switch (bytes) { #define __EM_ASM(str) \ asm("push %[flags]; popf \n\t" \ "10: " str \ "pushf; pop %[flags] \n\t" \ "11: \n\t" \ : "+a" (ctxt->dst.val), \ "+d" (ctxt->src.val), \ [flags] "+D" (flags), \ "+S" (ok) \ : "c" (ctxt->src2.val)) #define __EM_ASM_1(op, dst) \ __EM_ASM(#op " %%" #dst " \n\t") #define __EM_ASM_1_EX(op, dst) \ __EM_ASM(#op " %%" #dst " \n\t" \ _ASM_EXTABLE_TYPE_REG(10b, 11f, EX_TYPE_ZERO_REG, %%esi)) #define __EM_ASM_2(op, dst, src) \ __EM_ASM(#op " %%" #src ", %%" #dst " \n\t") #define __EM_ASM_3(op, dst, src, src2) \ __EM_ASM(#op " %%" #src2 ", %%" #src ", %%" #dst " \n\t") #define EM_ASM_END \ } \ ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); \ return !ok ? 
emulate_de(ctxt) : X86EMUL_CONTINUE; \ } /* 1-operand, using "a" (dst) */ #define EM_ASM_1(op) \ EM_ASM_START(op) \ case 1: __EM_ASM_1(op##b, al); break; \ case 2: __EM_ASM_1(op##w, ax); break; \ case 4: __EM_ASM_1(op##l, eax); break; \ ON64(case 8: __EM_ASM_1(op##q, rax); break;) \ EM_ASM_END /* 1-operand, using "c" (src2) */ #define EM_ASM_1SRC2(op, name) \ EM_ASM_START(name) \ case 1: __EM_ASM_1(op##b, cl); break; \ case 2: __EM_ASM_1(op##w, cx); break; \ case 4: __EM_ASM_1(op##l, ecx); break; \ ON64(case 8: __EM_ASM_1(op##q, rcx); break;) \ EM_ASM_END /* 1-operand, using "c" (src2) with exception */ #define EM_ASM_1SRC2EX(op, name) \ EM_ASM_START(name) \ case 1: __EM_ASM_1_EX(op##b, cl); break; \ case 2: __EM_ASM_1_EX(op##w, cx); break; \ case 4: __EM_ASM_1_EX(op##l, ecx); break; \ ON64(case 8: __EM_ASM_1_EX(op##q, rcx); break;) \ EM_ASM_END /* 2-operand, using "a" (dst), "d" (src) */ #define EM_ASM_2(op) \ EM_ASM_START(op) \ case 1: __EM_ASM_2(op##b, al, dl); break; \ case 2: __EM_ASM_2(op##w, ax, dx); break; \ case 4: __EM_ASM_2(op##l, eax, edx); break; \ ON64(case 8: __EM_ASM_2(op##q, rax, rdx); break;) \ EM_ASM_END /* 2-operand, reversed */ #define EM_ASM_2R(op, name) \ EM_ASM_START(name) \ case 1: __EM_ASM_2(op##b, dl, al); break; \ case 2: __EM_ASM_2(op##w, dx, ax); break; \ case 4: __EM_ASM_2(op##l, edx, eax); break; \ ON64(case 8: __EM_ASM_2(op##q, rdx, rax); break;) \ EM_ASM_END /* 2-operand, word only (no byte op) */ #define EM_ASM_2W(op) \ EM_ASM_START(op) \ case 1: break; \ case 2: __EM_ASM_2(op##w, ax, dx); break; \ case 4: __EM_ASM_2(op##l, eax, edx); break; \ ON64(case 8: __EM_ASM_2(op##q, rax, rdx); break;) \ EM_ASM_END /* 2-operand, using "a" (dst) and CL (src2) */ #define EM_ASM_2CL(op) \ EM_ASM_START(op) \ case 1: __EM_ASM_2(op##b, al, cl); break; \ case 2: __EM_ASM_2(op##w, ax, cl); break; \ case 4: __EM_ASM_2(op##l, eax, cl); break; \ ON64(case 8: __EM_ASM_2(op##q, rax, cl); break;) \ EM_ASM_END /* 3-operand, using "a" (dst), "d" (src) and CL (src2) */ #define EM_ASM_3WCL(op) \ EM_ASM_START(op) \ case 1: break; \ case 2: __EM_ASM_3(op##w, ax, dx, cl); break; \ case 4: __EM_ASM_3(op##l, eax, edx, cl); break; \ ON64(case 8: __EM_ASM_3(op##q, rax, rdx, cl); break;) \ EM_ASM_END static int em_salc(struct x86_emulate_ctxt *ctxt) { /* * Set AL 0xFF if CF is set, or 0x00 when clear. */ ctxt->dst.val = 0xFF * !!(ctxt->eflags & X86_EFLAGS_CF); return X86EMUL_CONTINUE; } /* * XXX: inoutclob user must know where the argument is being expanded. * Using asm goto would allow us to remove _fault. */ #define asm_safe(insn, inoutclob...) \ ({ \ int _fault = 0; \ \ asm volatile("1:" insn "\n" \ "2:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[_fault]) \ : [_fault] "+r"(_fault) inoutclob ); \ \ _fault ? 
X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \ }) static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, enum x86_intercept intercept, enum x86_intercept_stage stage) { struct x86_instruction_info info = { .intercept = intercept, .rep_prefix = ctxt->rep_prefix, .modrm_mod = ctxt->modrm_mod, .modrm_reg = ctxt->modrm_reg, .modrm_rm = ctxt->modrm_rm, .src_val = ctxt->src.val64, .dst_val = ctxt->dst.val64, .src_bytes = ctxt->src.bytes, .dst_bytes = ctxt->dst.bytes, .src_type = ctxt->src.type, .dst_type = ctxt->dst.type, .ad_bytes = ctxt->ad_bytes, .rip = ctxt->eip, .next_rip = ctxt->_eip, }; return ctxt->ops->intercept(ctxt, &info, stage); } static void assign_masked(ulong *dest, ulong src, ulong mask) { *dest = (*dest & ~mask) | (src & mask); } static void assign_register(unsigned long *reg, u64 val, int bytes) { /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ switch (bytes) { case 1: *(u8 *)reg = (u8)val; break; case 2: *(u16 *)reg = (u16)val; break; case 4: *reg = (u32)val; break; /* 64b: zero-extend */ case 8: *reg = val; break; } } static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) { return (1UL << (ctxt->ad_bytes << 3)) - 1; } static ulong stack_mask(struct x86_emulate_ctxt *ctxt) { u16 sel; struct desc_struct ss; if (ctxt->mode == X86EMUL_MODE_PROT64) return ~0UL; ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS); return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */ } static int stack_size(struct x86_emulate_ctxt *ctxt) { return (__fls(stack_mask(ctxt)) + 1) >> 3; } /* Access/update address held in a register, based on addressing mode. */ static inline unsigned long address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) { if (ctxt->ad_bytes == sizeof(unsigned long)) return reg; else return reg & ad_mask(ctxt); } static inline unsigned long register_address(struct x86_emulate_ctxt *ctxt, int reg) { return address_mask(ctxt, reg_read(ctxt, reg)); } static void masked_increment(ulong *reg, ulong mask, int inc) { assign_masked(reg, *reg + inc, mask); } static inline void register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc) { ulong *preg = reg_rmw(ctxt, reg); assign_register(preg, *preg + inc, ctxt->ad_bytes); } static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) { masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc); } static u32 desc_limit_scaled(struct desc_struct *desc) { u32 limit = get_desc_limit(desc); return desc->g ? 
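/*
 * Worked example: with the granularity bit set (desc->g == 1), a raw
 * 20-bit limit of 0x1f scales to (0x1f << 12) | 0xfff == 0x1ffff,
 * i.e. the segment spans 128 KiB.
 */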
(limit << 12) | 0xfff : limit; } static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) { if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) return 0; return ctxt->ops->get_cached_segment_base(ctxt, seg); } static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, u32 error, bool valid) { if (KVM_EMULATOR_BUG_ON(vec > 0x1f, ctxt)) return X86EMUL_UNHANDLEABLE; ctxt->exception.vector = vec; ctxt->exception.error_code = error; ctxt->exception.error_code_valid = valid; return X86EMUL_PROPAGATE_FAULT; } static int emulate_db(struct x86_emulate_ctxt *ctxt) { return emulate_exception(ctxt, DB_VECTOR, 0, false); } static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err) { return emulate_exception(ctxt, GP_VECTOR, err, true); } static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err) { return emulate_exception(ctxt, SS_VECTOR, err, true); } static int emulate_ud(struct x86_emulate_ctxt *ctxt) { return emulate_exception(ctxt, UD_VECTOR, 0, false); } static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err) { return emulate_exception(ctxt, TS_VECTOR, err, true); } static int emulate_de(struct x86_emulate_ctxt *ctxt) { return emulate_exception(ctxt, DE_VECTOR, 0, false); } static int emulate_nm(struct x86_emulate_ctxt *ctxt) { return emulate_exception(ctxt, NM_VECTOR, 0, false); } static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) { u16 selector; struct desc_struct desc; ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg); return selector; } static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, unsigned seg) { u16 dummy; u32 base3; struct desc_struct desc; ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg); ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg); } static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt) { return (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_LA57) ? 57 : 48; } static inline bool emul_is_noncanonical_address(u64 la, struct x86_emulate_ctxt *ctxt, unsigned int flags) { return !ctxt->ops->is_canonical_addr(ctxt, la, flags); } /* * x86 defines three classes of vector instructions: explicitly * aligned, explicitly unaligned, and the rest, which change behaviour * depending on whether they're AVX encoded or not. * * Also included is CMPXCHG16B which is not a vector instruction, yet it is * subject to the same check. FXSAVE and FXRSTOR are checked here too as their * 512 bytes of data must be aligned to a 16 byte boundary. 
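 *
 * Concretely, as implemented by insn_alignment() below: accesses narrower
 * than 16 bytes are never alignment-checked; Unaligned and Avx encodings are
 * not checked either; Aligned16 is checked against a fixed 16-byte boundary;
 * everything else (Aligned and the default case) is checked against the
 * access size itself.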
*/ static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size) { u64 alignment = ctxt->d & AlignMask; if (likely(size < 16)) return 1; switch (alignment) { case Unaligned: case Avx: return 1; case Aligned16: return 16; case Aligned: default: return size; } } static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, unsigned *max_size, unsigned size, enum x86emul_mode mode, ulong *linear, unsigned int flags) { struct desc_struct desc; bool usable; ulong la; u32 lim; u16 sel; u8 va_bits; la = seg_base(ctxt, addr.seg) + addr.ea; *max_size = 0; switch (mode) { case X86EMUL_MODE_PROT64: *linear = la = ctxt->ops->get_untagged_addr(ctxt, la, flags); va_bits = ctxt_virt_addr_bits(ctxt); if (!__is_canonical_address(la, va_bits)) goto bad; *max_size = min_t(u64, ~0u, (1ull << va_bits) - la); if (size > *max_size) goto bad; break; default: *linear = la = (u32)la; usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL, addr.seg); if (!usable) goto bad; /* code segment in protected mode or read-only data segment */ if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8)) || !(desc.type & 2)) && (flags & X86EMUL_F_WRITE)) goto bad; /* unreadable code segment */ if (!(flags & X86EMUL_F_FETCH) && (desc.type & 8) && !(desc.type & 2)) goto bad; lim = desc_limit_scaled(&desc); if (!(desc.type & 8) && (desc.type & 4)) { /* expand-down segment */ if (addr.ea <= lim) goto bad; lim = desc.d ? 0xffffffff : 0xffff; } if (addr.ea > lim) goto bad; if (lim == 0xffffffff) *max_size = ~0u; else { *max_size = (u64)lim + 1 - addr.ea; if (size > *max_size) goto bad; } break; } if (la & (insn_alignment(ctxt, size) - 1)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; bad: if (addr.seg == VCPU_SREG_SS) return emulate_ss(ctxt, 0); else return emulate_gp(ctxt, 0); } static int linearize(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, unsigned size, bool write, ulong *linear) { unsigned max_size; return __linearize(ctxt, addr, &max_size, size, ctxt->mode, linear, write ? X86EMUL_F_WRITE : 0); } static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst) { ulong linear; int rc; unsigned max_size; struct segmented_address addr = { .seg = VCPU_SREG_CS, .ea = dst }; if (ctxt->op_bytes != sizeof(unsigned long)) addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1); rc = __linearize(ctxt, addr, &max_size, 1, ctxt->mode, &linear, X86EMUL_F_FETCH); if (rc == X86EMUL_CONTINUE) ctxt->_eip = addr.ea; return rc; } static inline int emulator_recalc_and_set_mode(struct x86_emulate_ctxt *ctxt) { u64 efer; struct desc_struct cs; u16 selector; u32 base3; ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if (!(ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PE)) { /* Real mode. cpu must not have long mode active */ if (efer & EFER_LMA) return X86EMUL_UNHANDLEABLE; ctxt->mode = X86EMUL_MODE_REAL; return X86EMUL_CONTINUE; } if (ctxt->eflags & X86_EFLAGS_VM) { /* Protected/VM86 mode. cpu must not have long mode active */ if (efer & EFER_LMA) return X86EMUL_UNHANDLEABLE; ctxt->mode = X86EMUL_MODE_VM86; return X86EMUL_CONTINUE; } if (!ctxt->ops->get_segment(ctxt, &selector, &cs, &base3, VCPU_SREG_CS)) return X86EMUL_UNHANDLEABLE; if (efer & EFER_LMA) { if (cs.l) { /* Proper long mode */ ctxt->mode = X86EMUL_MODE_PROT64; } else if (cs.d) { /* 32 bit compatibility mode*/ ctxt->mode = X86EMUL_MODE_PROT32; } else { ctxt->mode = X86EMUL_MODE_PROT16; } } else { /* Legacy 32 bit / 16 bit mode */ ctxt->mode = cs.d ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; } return X86EMUL_CONTINUE; } static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) { return assign_eip(ctxt, dst); } static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst) { int rc = emulator_recalc_and_set_mode(ctxt); if (rc != X86EMUL_CONTINUE) return rc; return assign_eip(ctxt, dst); } static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) { return assign_eip_near(ctxt, ctxt->_eip + rel); } static int linear_read_system(struct x86_emulate_ctxt *ctxt, ulong linear, void *data, unsigned size) { return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, true); } static int linear_write_system(struct x86_emulate_ctxt *ctxt, ulong linear, void *data, unsigned int size) { return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, true); } static int segmented_read_std(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, void *data, unsigned size) { int rc; ulong linear; rc = linearize(ctxt, addr, size, false, &linear); if (rc != X86EMUL_CONTINUE) return rc; return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, false); } static int segmented_write_std(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, void *data, unsigned int size) { int rc; ulong linear; rc = linearize(ctxt, addr, size, true, &linear); if (rc != X86EMUL_CONTINUE) return rc; return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, false); } /* * Prefetch the remaining bytes of the instruction without crossing page * boundary if they are not in fetch_cache yet. */ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) { int rc; unsigned size, max_size; unsigned long linear; int cur_size = ctxt->fetch.end - ctxt->fetch.data; struct segmented_address addr = { .seg = VCPU_SREG_CS, .ea = ctxt->eip + cur_size }; /* * We do not know exactly how many bytes will be needed, and * __linearize is expensive, so fetch as much as possible. We * just have to avoid going beyond the 15 byte limit, the end * of the segment, or the end of the page. * * __linearize is called with size 0 so that it does not do any * boundary check itself. Instead, we use max_size to check * against op_size. */ rc = __linearize(ctxt, addr, &max_size, 0, ctxt->mode, &linear, X86EMUL_F_FETCH); if (unlikely(rc != X86EMUL_CONTINUE)) return rc; size = min_t(unsigned, 15UL ^ cur_size, max_size); size = min_t(unsigned, size, PAGE_SIZE - offset_in_page(linear)); /* * One instruction can only straddle two pages, * and one has been loaded at the beginning of * x86_decode_insn. So, if not enough bytes * still, we must have hit the 15-byte boundary. */ if (unlikely(size < op_size)) return emulate_gp(ctxt, 0); rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end, size, &ctxt->exception); if (unlikely(rc != X86EMUL_CONTINUE)) return rc; ctxt->fetch.end += size; return X86EMUL_CONTINUE; } static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, unsigned size) { unsigned done_size = ctxt->fetch.end - ctxt->fetch.ptr; if (unlikely(done_size < size)) return __do_insn_fetch_bytes(ctxt, size - done_size); else return X86EMUL_CONTINUE; } /* Fetch next part of the instruction being emulated. 
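 *
 * The insn_fetch()/insn_fetch_arr() macros below first make sure enough
 * bytes sit in the fetch cache (via do_insn_fetch_bytes()), then copy them
 * out, advance _eip and fetch.ptr, and jump to the caller's local "done"
 * label on failure; they rely on a local "rc" variable in the caller.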
*/ #define insn_fetch(_type, _ctxt) \ ({ _type _x; \ \ rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ ctxt->_eip += sizeof(_type); \ memcpy(&_x, ctxt->fetch.ptr, sizeof(_type)); \ ctxt->fetch.ptr += sizeof(_type); \ _x; \ }) #define insn_fetch_arr(_arr, _size, _ctxt) \ ({ \ rc = do_insn_fetch_bytes(_ctxt, _size); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ ctxt->_eip += (_size); \ memcpy(_arr, ctxt->fetch.ptr, _size); \ ctxt->fetch.ptr += (_size); \ }) /* * Given the 'reg' portion of a ModRM byte, and a register block, return a * pointer into the block that addresses the relevant register. * @highbyte_regs specifies whether to decode AH,CH,DH,BH. */ static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg, int byteop) { void *p; int highbyte_regs = (ctxt->rex_prefix == 0) && byteop; if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1; else p = reg_rmw(ctxt, modrm_reg); return p; } static int read_descriptor(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, u16 *size, unsigned long *address, int op_bytes) { int rc; if (op_bytes == 2) op_bytes = 3; *address = 0; rc = segmented_read_std(ctxt, addr, size, 2); if (rc != X86EMUL_CONTINUE) return rc; addr.ea += 2; rc = segmented_read_std(ctxt, addr, address, op_bytes); return rc; } EM_ASM_2(add); EM_ASM_2(or); EM_ASM_2(adc); EM_ASM_2(sbb); EM_ASM_2(and); EM_ASM_2(sub); EM_ASM_2(xor); EM_ASM_2(cmp); EM_ASM_2(test); EM_ASM_2(xadd); EM_ASM_1SRC2(mul, mul_ex); EM_ASM_1SRC2(imul, imul_ex); EM_ASM_1SRC2EX(div, div_ex); EM_ASM_1SRC2EX(idiv, idiv_ex); EM_ASM_3WCL(shld); EM_ASM_3WCL(shrd); EM_ASM_2W(imul); EM_ASM_1(not); EM_ASM_1(neg); EM_ASM_1(inc); EM_ASM_1(dec); EM_ASM_2CL(rol); EM_ASM_2CL(ror); EM_ASM_2CL(rcl); EM_ASM_2CL(rcr); EM_ASM_2CL(shl); EM_ASM_2CL(shr); EM_ASM_2CL(sar); EM_ASM_2W(bsf); EM_ASM_2W(bsr); EM_ASM_2W(bt); EM_ASM_2W(bts); EM_ASM_2W(btr); EM_ASM_2W(btc); EM_ASM_2R(cmp, cmp_r); static int em_bsf_c(struct x86_emulate_ctxt *ctxt) { /* If src is zero, do not writeback, but update flags */ if (ctxt->src.val == 0) ctxt->dst.type = OP_NONE; return em_bsf(ctxt); } static int em_bsr_c(struct x86_emulate_ctxt *ctxt) { /* If src is zero, do not writeback, but update flags */ if (ctxt->src.val == 0) ctxt->dst.type = OP_NONE; return em_bsr(ctxt); } static __always_inline u8 test_cc(unsigned int condition, unsigned long flags) { return __emulate_cc(flags, condition & 0xf); } static void fetch_register_operand(struct operand *op) { switch (op->bytes) { case 1: op->val = *(u8 *)op->addr.reg; break; case 2: op->val = *(u16 *)op->addr.reg; break; case 4: op->val = *(u32 *)op->addr.reg; break; case 8: op->val = *(u64 *)op->addr.reg; break; } } static int em_fninit(struct x86_emulate_ctxt *ctxt) { if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); kvm_fpu_get(); asm volatile("fninit"); kvm_fpu_put(); return X86EMUL_CONTINUE; } static int em_fnstcw(struct x86_emulate_ctxt *ctxt) { u16 fcw; if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); kvm_fpu_get(); asm volatile("fnstcw %0": "+m"(fcw)); kvm_fpu_put(); ctxt->dst.val = fcw; return X86EMUL_CONTINUE; } static int em_fnstsw(struct x86_emulate_ctxt *ctxt) { u16 fsw; if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); kvm_fpu_get(); asm volatile("fnstsw %0": "+m"(fsw)); kvm_fpu_put(); ctxt->dst.val = fsw; return X86EMUL_CONTINUE; } static void decode_register_operand(struct 
x86_emulate_ctxt *ctxt, struct operand *op) { unsigned int reg; if (ctxt->d & ModRM) reg = ctxt->modrm_reg; else reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3); if (ctxt->d & Sse) { op->type = OP_XMM; op->bytes = 16; op->addr.xmm = reg; kvm_read_sse_reg(reg, &op->vec_val); return; } if (ctxt->d & Mmx) { reg &= 7; op->type = OP_MM; op->bytes = 8; op->addr.mm = reg; return; } op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp); fetch_register_operand(op); op->orig_val = op->val; } static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg) { if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP) ctxt->modrm_seg = VCPU_SREG_SS; } static int decode_modrm(struct x86_emulate_ctxt *ctxt, struct operand *op) { u8 sib; int index_reg, base_reg, scale; int rc = X86EMUL_CONTINUE; ulong modrm_ea = 0; ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */ index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */ base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */ ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; ctxt->modrm_rm = base_reg | (ctxt->modrm & 0x07); ctxt->modrm_seg = VCPU_SREG_DS; if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) { op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, ctxt->d & ByteOp); if (ctxt->d & Sse) { op->type = OP_XMM; op->bytes = 16; op->addr.xmm = ctxt->modrm_rm; kvm_read_sse_reg(ctxt->modrm_rm, &op->vec_val); return rc; } if (ctxt->d & Mmx) { op->type = OP_MM; op->bytes = 8; op->addr.mm = ctxt->modrm_rm & 7; return rc; } fetch_register_operand(op); return rc; } op->type = OP_MEM; if (ctxt->ad_bytes == 2) { unsigned bx = reg_read(ctxt, VCPU_REGS_RBX); unsigned bp = reg_read(ctxt, VCPU_REGS_RBP); unsigned si = reg_read(ctxt, VCPU_REGS_RSI); unsigned di = reg_read(ctxt, VCPU_REGS_RDI); /* 16-bit ModR/M decode. */ switch (ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 6) modrm_ea += insn_fetch(u16, ctxt); break; case 1: modrm_ea += insn_fetch(s8, ctxt); break; case 2: modrm_ea += insn_fetch(u16, ctxt); break; } switch (ctxt->modrm_rm) { case 0: modrm_ea += bx + si; break; case 1: modrm_ea += bx + di; break; case 2: modrm_ea += bp + si; break; case 3: modrm_ea += bp + di; break; case 4: modrm_ea += si; break; case 5: modrm_ea += di; break; case 6: if (ctxt->modrm_mod != 0) modrm_ea += bp; break; case 7: modrm_ea += bx; break; } if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 || (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0)) ctxt->modrm_seg = VCPU_SREG_SS; modrm_ea = (u16)modrm_ea; } else { /* 32/64-bit ModR/M decode. 
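 * r/m == 4 means a SIB byte follows; a SIB base of 5 with mod == 0, or
 * r/m == 5 with mod == 0, takes a 32-bit displacement (the latter is
 * RIP-relative in 64-bit mode); mod == 1 then adds a disp8 and mod == 2
 * a disp32.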
*/ if ((ctxt->modrm_rm & 7) == 4) { sib = insn_fetch(u8, ctxt); index_reg |= (sib >> 3) & 7; base_reg |= sib & 7; scale = sib >> 6; if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) modrm_ea += insn_fetch(s32, ctxt); else { modrm_ea += reg_read(ctxt, base_reg); adjust_modrm_seg(ctxt, base_reg); /* Increment ESP on POP [ESP] */ if ((ctxt->d & IncSP) && base_reg == VCPU_REGS_RSP) modrm_ea += ctxt->op_bytes; } if (index_reg != 4) modrm_ea += reg_read(ctxt, index_reg) << scale; } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { modrm_ea += insn_fetch(s32, ctxt); if (ctxt->mode == X86EMUL_MODE_PROT64) ctxt->rip_relative = 1; } else { base_reg = ctxt->modrm_rm; modrm_ea += reg_read(ctxt, base_reg); adjust_modrm_seg(ctxt, base_reg); } switch (ctxt->modrm_mod) { case 1: modrm_ea += insn_fetch(s8, ctxt); break; case 2: modrm_ea += insn_fetch(s32, ctxt); break; } } op->addr.mem.ea = modrm_ea; if (ctxt->ad_bytes != 8) ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea; done: return rc; } static int decode_abs(struct x86_emulate_ctxt *ctxt, struct operand *op) { int rc = X86EMUL_CONTINUE; op->type = OP_MEM; switch (ctxt->ad_bytes) { case 2: op->addr.mem.ea = insn_fetch(u16, ctxt); break; case 4: op->addr.mem.ea = insn_fetch(u32, ctxt); break; case 8: op->addr.mem.ea = insn_fetch(u64, ctxt); break; } done: return rc; } static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt) { long sv = 0, mask; if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) { mask = ~((long)ctxt->dst.bytes * 8 - 1); if (ctxt->src.bytes == 2) sv = (s16)ctxt->src.val & (s16)mask; else if (ctxt->src.bytes == 4) sv = (s32)ctxt->src.val & (s32)mask; else sv = (s64)ctxt->src.val & (s64)mask; ctxt->dst.addr.mem.ea = address_mask(ctxt, ctxt->dst.addr.mem.ea + (sv >> 3)); } /* only subword offset */ ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; } static int read_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *dest, unsigned size) { int rc; struct read_cache *mc = &ctxt->mem_read; if (mc->pos < mc->end) goto read_cached; if (KVM_EMULATOR_BUG_ON((mc->end + size) >= sizeof(mc->data), ctxt)) return X86EMUL_UNHANDLEABLE; rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size, &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; mc->end += size; read_cached: memcpy(dest, mc->data + mc->pos, size); mc->pos += size; return X86EMUL_CONTINUE; } static int segmented_read(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, void *data, unsigned size) { int rc; ulong linear; rc = linearize(ctxt, addr, size, false, &linear); if (rc != X86EMUL_CONTINUE) return rc; return read_emulated(ctxt, linear, data, size); } static int segmented_write(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, const void *data, unsigned size) { int rc; ulong linear; rc = linearize(ctxt, addr, size, true, &linear); if (rc != X86EMUL_CONTINUE) return rc; return ctxt->ops->write_emulated(ctxt, linear, data, size, &ctxt->exception); } static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, const void *orig_data, const void *data, unsigned size) { int rc; ulong linear; rc = linearize(ctxt, addr, size, true, &linear); if (rc != X86EMUL_CONTINUE) return rc; return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data, size, &ctxt->exception); } static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, unsigned int size, unsigned short port, void *dest) { struct read_cache *rc = &ctxt->io_read; if (rc->pos == rc->end) { /* refill pio read ahead */ unsigned int 
in_page, n; unsigned int count = ctxt->rep_prefix ? address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1; in_page = (ctxt->eflags & X86_EFLAGS_DF) ? offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) : PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)); n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count); if (n == 0) n = 1; rc->pos = rc->end = 0; if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n)) return 0; rc->end = n * size; } if (ctxt->rep_prefix && (ctxt->d & String) && !(ctxt->eflags & X86_EFLAGS_DF)) { ctxt->dst.data = rc->data + rc->pos; ctxt->dst.type = OP_MEM_STR; ctxt->dst.count = (rc->end - rc->pos) / size; rc->pos = rc->end; } else { memcpy(dest, rc->data + rc->pos, size); rc->pos += size; } return 1; } static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt, u16 index, struct desc_struct *desc) { struct desc_ptr dt; ulong addr; ctxt->ops->get_idt(ctxt, &dt); if (dt.size < index * 8 + 7) return emulate_gp(ctxt, index << 3 | 0x2); addr = dt.address + index * 8; return linear_read_system(ctxt, addr, desc, sizeof(*desc)); } static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_ptr *dt) { const struct x86_emulate_ops *ops = ctxt->ops; u32 base3 = 0; if (selector & 1 << 2) { struct desc_struct desc; u16 sel; memset(dt, 0, sizeof(*dt)); if (!ops->get_segment(ctxt, &sel, &desc, &base3, VCPU_SREG_LDTR)) return; dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ dt->address = get_desc_base(&desc) | ((u64)base3 << 32); } else ops->get_gdt(ctxt, dt); } static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt, u16 selector, ulong *desc_addr_p) { struct desc_ptr dt; u16 index = selector >> 3; ulong addr; get_descriptor_table_ptr(ctxt, selector, &dt); if (dt.size < index * 8 + 7) return emulate_gp(ctxt, selector & 0xfffc); addr = dt.address + index * 8; #ifdef CONFIG_X86_64 if (addr >> 32 != 0) { u64 efer = 0; ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if (!(efer & EFER_LMA)) addr &= (u32)-1; } #endif *desc_addr_p = addr; return X86EMUL_CONTINUE; } /* allowed just for 8 bytes segments */ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_struct *desc, ulong *desc_addr_p) { int rc; rc = get_descriptor_ptr(ctxt, selector, desc_addr_p); if (rc != X86EMUL_CONTINUE) return rc; return linear_read_system(ctxt, *desc_addr_p, desc, sizeof(*desc)); } /* allowed just for 8 bytes segments */ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_struct *desc) { int rc; ulong addr; rc = get_descriptor_ptr(ctxt, selector, &addr); if (rc != X86EMUL_CONTINUE) return rc; return linear_write_system(ctxt, addr, desc, sizeof(*desc)); } static bool emulator_is_ssp_invalid(struct x86_emulate_ctxt *ctxt, u8 cpl) { const u32 MSR_IA32_X_CET = cpl == 3 ? MSR_IA32_U_CET : MSR_IA32_S_CET; u64 efer = 0, cet = 0, ssp = 0; if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_CET)) return false; if (ctxt->ops->get_msr(ctxt, MSR_EFER, &efer)) return true; /* SSP is guaranteed to be valid if the vCPU was already in 32-bit mode. */ if (!(efer & EFER_LMA)) return false; if (ctxt->ops->get_msr(ctxt, MSR_IA32_X_CET, &cet)) return true; if (!(cet & CET_SHSTK_EN)) return false; if (ctxt->ops->get_msr(ctxt, MSR_KVM_INTERNAL_GUEST_SSP, &ssp)) return true; /* * On transfer from 64-bit mode to compatibility mode, SSP[63:32] must * be 0, i.e. SSP must be a 32-bit value outside of 64-bit mode. 
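 * Hence the check below simply reports whether any bit in SSP[63:32] is set.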
*/ return ssp >> 32; } static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg, u8 cpl, enum x86_transfer_type transfer, struct desc_struct *desc) { struct desc_struct seg_desc, old_desc; u8 dpl, rpl; unsigned err_vec = GP_VECTOR; u32 err_code = 0; bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ ulong desc_addr; int ret; u16 dummy; u32 base3 = 0; memset(&seg_desc, 0, sizeof(seg_desc)); if (ctxt->mode == X86EMUL_MODE_REAL) { /* set real mode segment descriptor (keep limit etc. for * unreal mode) */ ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg); set_desc_base(&seg_desc, selector << 4); goto load; } else if (seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) { /* VM86 needs a clean new segment descriptor */ set_desc_base(&seg_desc, selector << 4); set_desc_limit(&seg_desc, 0xffff); seg_desc.type = 3; seg_desc.p = 1; seg_desc.s = 1; seg_desc.dpl = 3; goto load; } rpl = selector & 3; /* TR should be in GDT only */ if (seg == VCPU_SREG_TR && (selector & (1 << 2))) goto exception; /* NULL selector is not valid for TR, CS and (except for long mode) SS */ if (null_selector) { if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR) goto exception; if (seg == VCPU_SREG_SS) { if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl) goto exception; /* * ctxt->ops->set_segment expects the CPL to be in * SS.DPL, so fake an expand-up 32-bit data segment. */ seg_desc.type = 3; seg_desc.p = 1; seg_desc.s = 1; seg_desc.dpl = cpl; seg_desc.d = 1; seg_desc.g = 1; } /* Skip all following checks */ goto load; } ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) return ret; err_code = selector & 0xfffc; err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR : GP_VECTOR; /* can't load system descriptor into segment selector */ if (seg <= VCPU_SREG_GS && !seg_desc.s) { if (transfer == X86_TRANSFER_CALL_JMP) return X86EMUL_UNHANDLEABLE; goto exception; } dpl = seg_desc.dpl; switch (seg) { case VCPU_SREG_SS: /* * segment is not a writable data segment or segment * selector's RPL != CPL or DPL != CPL */ if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl) goto exception; break; case VCPU_SREG_CS: /* * KVM uses "none" when loading CS as part of emulating Real * Mode exceptions and IRET (handled above). In all other * cases, loading CS without a control transfer is a KVM bug. */ if (WARN_ON_ONCE(transfer == X86_TRANSFER_NONE)) goto exception; if (!(seg_desc.type & 8)) goto exception; if (transfer == X86_TRANSFER_RET) { /* RET can never return to an inner privilege level. 
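 * (a popped CS.RPL numerically below the current CPL would mean gaining
 * privilege, so it faults; a return to an outer, less privileged level is
 * punted back to userspace as unhandleable below).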
*/ if (rpl < cpl) goto exception; /* Outer-privilege level return is not implemented */ if (rpl > cpl) return X86EMUL_UNHANDLEABLE; } if (transfer == X86_TRANSFER_RET || transfer == X86_TRANSFER_TASK_SWITCH) { if (seg_desc.type & 4) { /* conforming */ if (dpl > rpl) goto exception; } else { /* nonconforming */ if (dpl != rpl) goto exception; } } else { /* X86_TRANSFER_CALL_JMP */ if (seg_desc.type & 4) { /* conforming */ if (dpl > cpl) goto exception; } else { /* nonconforming */ if (rpl > cpl || dpl != cpl) goto exception; } } /* in long-mode d/b must be clear if l is set */ if (seg_desc.d && seg_desc.l) { u64 efer = 0; ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if (efer & EFER_LMA) goto exception; } if (!seg_desc.l && emulator_is_ssp_invalid(ctxt, cpl)) { err_code = 0; goto exception; } /* CS(RPL) <- CPL */ selector = (selector & 0xfffc) | cpl; break; case VCPU_SREG_TR: if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) goto exception; break; case VCPU_SREG_LDTR: if (seg_desc.s || seg_desc.type != 2) goto exception; break; default: /* DS, ES, FS, or GS */ /* * segment is not a data or readable code segment or * ((segment is a data or nonconforming code segment) * and ((RPL > DPL) or (CPL > DPL))) */ if ((seg_desc.type & 0xa) == 0x8 || (((seg_desc.type & 0xc) != 0xc) && (rpl > dpl || cpl > dpl))) goto exception; break; } if (!seg_desc.p) { err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; goto exception; } if (seg_desc.s) { /* mark segment as accessed */ if (!(seg_desc.type & 1)) { seg_desc.type |= 1; ret = write_segment_descriptor(ctxt, selector, &seg_desc); if (ret != X86EMUL_CONTINUE) return ret; } } else if (ctxt->mode == X86EMUL_MODE_PROT64) { ret = linear_read_system(ctxt, desc_addr+8, &base3, sizeof(base3)); if (ret != X86EMUL_CONTINUE) return ret; if (emul_is_noncanonical_address(get_desc_base(&seg_desc) | ((u64)base3 << 32), ctxt, X86EMUL_F_DT_LOAD)) return emulate_gp(ctxt, err_code); } if (seg == VCPU_SREG_TR) { old_desc = seg_desc; seg_desc.type |= 2; /* busy */ ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, sizeof(seg_desc), &ctxt->exception); if (ret != X86EMUL_CONTINUE) return ret; } load: ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg); if (desc) *desc = seg_desc; return X86EMUL_CONTINUE; exception: return emulate_exception(ctxt, err_vec, err_code, true); } static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { u8 cpl = ctxt->ops->cpl(ctxt); /* * None of MOV, POP and LSS can load a NULL selector in CPL=3, but * they can load it at CPL<3 (Intel's manual says only LSS can, * but it's wrong). * * However, the Intel manual says that putting IST=1/DPL=3 in * an interrupt gate will result in SS=3 (the AMD manual instead * says it doesn't), so allow SS=3 in __load_segment_descriptor * and only forbid it here. 
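 * In practice that means a NULL SS selector with RPL 3 is rejected only for
 * direct MOV/POP/LSS-style loads in 64-bit mode, which is what the check
 * below implements.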
*/ if (seg == VCPU_SREG_SS && selector == 3 && ctxt->mode == X86EMUL_MODE_PROT64) return emulate_exception(ctxt, GP_VECTOR, 0, true); return __load_segment_descriptor(ctxt, selector, seg, cpl, X86_TRANSFER_NONE, NULL); } static void write_register_operand(struct operand *op) { return assign_register(op->addr.reg, op->val, op->bytes); } static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) { switch (op->type) { case OP_REG: write_register_operand(op); break; case OP_MEM: if (ctxt->lock_prefix) return segmented_cmpxchg(ctxt, op->addr.mem, &op->orig_val, &op->val, op->bytes); else return segmented_write(ctxt, op->addr.mem, &op->val, op->bytes); case OP_MEM_STR: return segmented_write(ctxt, op->addr.mem, op->data, op->bytes * op->count); case OP_XMM: kvm_write_sse_reg(op->addr.xmm, &op->vec_val); break; case OP_MM: kvm_write_mmx_reg(op->addr.mm, &op->mm_val); break; case OP_NONE: /* no writeback */ break; default: break; } return X86EMUL_CONTINUE; } static int emulate_push(struct x86_emulate_ctxt *ctxt, const void *data, int len) { struct segmented_address addr; rsp_increment(ctxt, -len); addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; return segmented_write(ctxt, addr, data, len); } static int em_push(struct x86_emulate_ctxt *ctxt) { /* Disable writeback. */ ctxt->dst.type = OP_NONE; return emulate_push(ctxt, &ctxt->src.val, ctxt->op_bytes); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, void *dest, int len) { int rc; struct segmented_address addr; addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; rc = segmented_read(ctxt, addr, dest, len); if (rc != X86EMUL_CONTINUE) return rc; rsp_increment(ctxt, len); return rc; } static int em_pop(struct x86_emulate_ctxt *ctxt) { return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); } static int emulate_popf(struct x86_emulate_ctxt *ctxt, void *dest, int len) { int rc; unsigned long val = 0; unsigned long change_mask; int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT; int cpl = ctxt->ops->cpl(ctxt); rc = emulate_pop(ctxt, &val, len); if (rc != X86EMUL_CONTINUE) return rc; change_mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF | X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_NT | X86_EFLAGS_AC | X86_EFLAGS_ID; switch(ctxt->mode) { case X86EMUL_MODE_PROT64: case X86EMUL_MODE_PROT32: case X86EMUL_MODE_PROT16: if (cpl == 0) change_mask |= X86_EFLAGS_IOPL; if (cpl <= iopl) change_mask |= X86_EFLAGS_IF; break; case X86EMUL_MODE_VM86: if (iopl < 3) return emulate_gp(ctxt, 0); change_mask |= X86_EFLAGS_IF; break; default: /* real mode */ change_mask |= (X86_EFLAGS_IOPL | X86_EFLAGS_IF); break; } *(unsigned long *)dest = (ctxt->eflags & ~change_mask) | (val & change_mask); return rc; } static int em_popf(struct x86_emulate_ctxt *ctxt) { ctxt->dst.type = OP_REG; ctxt->dst.addr.reg = &ctxt->eflags; ctxt->dst.bytes = ctxt->op_bytes; return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); } static int em_enter(struct x86_emulate_ctxt *ctxt) { int rc; unsigned frame_size = ctxt->src.val; unsigned nesting_level = ctxt->src2.val & 31; ulong rbp; if (nesting_level) return X86EMUL_UNHANDLEABLE; rbp = reg_read(ctxt, VCPU_REGS_RBP); rc = emulate_push(ctxt, &rbp, stack_size(ctxt)); if (rc != X86EMUL_CONTINUE) return rc; assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP), stack_mask(ctxt)); assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), reg_read(ctxt, VCPU_REGS_RSP) - frame_size, 
stack_mask(ctxt)); return X86EMUL_CONTINUE; } static int em_leave(struct x86_emulate_ctxt *ctxt) { assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), reg_read(ctxt, VCPU_REGS_RBP), stack_mask(ctxt)); return emulate_pop(ctxt, reg_rmw(ctxt, VCPU_REGS_RBP), ctxt->op_bytes); } static int em_push_sreg(struct x86_emulate_ctxt *ctxt) { int seg = ctxt->src2.val; ctxt->src.val = get_segment_selector(ctxt, seg); if (ctxt->op_bytes == 4) { rsp_increment(ctxt, -2); ctxt->op_bytes = 2; } return em_push(ctxt); } static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) { int seg = ctxt->src2.val; unsigned long selector = 0; int rc; rc = emulate_pop(ctxt, &selector, 2); if (rc != X86EMUL_CONTINUE) return rc; if (seg == VCPU_SREG_SS) ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; if (ctxt->op_bytes > 2) rsp_increment(ctxt, ctxt->op_bytes - 2); rc = load_segment_descriptor(ctxt, (u16)selector, seg); return rc; } static int em_pusha(struct x86_emulate_ctxt *ctxt) { unsigned long old_esp = reg_read(ctxt, VCPU_REGS_RSP); int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RAX; while (reg <= VCPU_REGS_RDI) { (reg == VCPU_REGS_RSP) ? (ctxt->src.val = old_esp) : (ctxt->src.val = reg_read(ctxt, reg)); rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; ++reg; } return rc; } static int em_pushf(struct x86_emulate_ctxt *ctxt) { ctxt->src.val = (unsigned long)ctxt->eflags & ~X86_EFLAGS_VM; return em_push(ctxt); } static int em_popa(struct x86_emulate_ctxt *ctxt) { int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RDI; u32 val = 0; while (reg >= VCPU_REGS_RAX) { if (reg == VCPU_REGS_RSP) { rsp_increment(ctxt, ctxt->op_bytes); --reg; } rc = emulate_pop(ctxt, &val, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) break; assign_register(reg_rmw(ctxt, reg), val, ctxt->op_bytes); --reg; } return rc; } static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) { const struct x86_emulate_ops *ops = ctxt->ops; int rc; struct desc_ptr dt; gva_t cs_addr; gva_t eip_addr; u16 cs, eip; /* TODO: Add limit checks */ ctxt->src.val = ctxt->eflags; rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; ctxt->eflags &= ~(X86_EFLAGS_IF | X86_EFLAGS_TF | X86_EFLAGS_AC); ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; ctxt->src.val = ctxt->_eip; rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; ops->get_idt(ctxt, &dt); eip_addr = dt.address + (irq << 2); cs_addr = dt.address + (irq << 2) + 2; rc = linear_read_system(ctxt, cs_addr, &cs, 2); if (rc != X86EMUL_CONTINUE) return rc; rc = linear_read_system(ctxt, eip_addr, &eip, 2); if (rc != X86EMUL_CONTINUE) return rc; rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS); if (rc != X86EMUL_CONTINUE) return rc; ctxt->_eip = eip; return rc; } int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) { int rc; invalidate_registers(ctxt); rc = __emulate_int_real(ctxt, irq); if (rc == X86EMUL_CONTINUE) writeback_registers(ctxt); return rc; } static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq) { switch(ctxt->mode) { case X86EMUL_MODE_REAL: return __emulate_int_real(ctxt, irq); case X86EMUL_MODE_VM86: case X86EMUL_MODE_PROT16: case X86EMUL_MODE_PROT32: case X86EMUL_MODE_PROT64: default: /* Protected mode interrupts unimplemented yet */ return X86EMUL_UNHANDLEABLE; } } static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) { int rc = X86EMUL_CONTINUE; unsigned long temp_eip = 0; unsigned long temp_eflags = 0; unsigned long cs = 0; unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 
X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF | X86_EFLAGS_IF | X86_EFLAGS_DF | X86_EFLAGS_OF | X86_EFLAGS_IOPL | X86_EFLAGS_NT | X86_EFLAGS_RF | X86_EFLAGS_AC | X86_EFLAGS_ID | X86_EFLAGS_FIXED; unsigned long vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF | X86_EFLAGS_VIP; /* TODO: Add stack limit check */ rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; if (temp_eip & ~0xffff) return emulate_gp(ctxt, 0); rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); if (rc != X86EMUL_CONTINUE) return rc; ctxt->_eip = temp_eip; if (ctxt->op_bytes == 4) ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); else if (ctxt->op_bytes == 2) { ctxt->eflags &= ~0xffff; ctxt->eflags |= temp_eflags; } ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ ctxt->eflags |= X86_EFLAGS_FIXED; ctxt->ops->set_nmi_mask(ctxt, false); return rc; } static int em_iret(struct x86_emulate_ctxt *ctxt) { switch(ctxt->mode) { case X86EMUL_MODE_REAL: return emulate_iret_real(ctxt); case X86EMUL_MODE_VM86: case X86EMUL_MODE_PROT16: case X86EMUL_MODE_PROT32: case X86EMUL_MODE_PROT64: default: /* iret from protected mode unimplemented yet */ return X86EMUL_UNHANDLEABLE; } } static int em_jmp_far(struct x86_emulate_ctxt *ctxt) { int rc; unsigned short sel; struct desc_struct new_desc; u8 cpl = ctxt->ops->cpl(ctxt); memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, X86_TRANSFER_CALL_JMP, &new_desc); if (rc != X86EMUL_CONTINUE) return rc; rc = assign_eip_far(ctxt, ctxt->src.val); /* Error handling is not implemented. 
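 *
 * A minimal sketch of the protected-mode flag merge done by emulate_popf()
 * above (illustrative helper only, not part of the emulator; change_mask
 * starts out as the unprivileged bits listed there):
 *
 *	static unsigned long popf_merge(unsigned long old, unsigned long popped,
 *					unsigned long change_mask, int cpl, int iopl)
 *	{
 *		if (cpl == 0)
 *			change_mask |= X86_EFLAGS_IOPL;	// only CPL 0 may change IOPL
 *		if (cpl <= iopl)
 *			change_mask |= X86_EFLAGS_IF;	// IF needs CPL <= IOPL
 *		return (old & ~change_mask) | (popped & change_mask);
 *	}
 *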
*/ if (rc != X86EMUL_CONTINUE) return X86EMUL_UNHANDLEABLE; return rc; } static int em_jmp_abs(struct x86_emulate_ctxt *ctxt) { return assign_eip_near(ctxt, ctxt->src.val); } static int em_call_near_abs(struct x86_emulate_ctxt *ctxt) { int rc; long int old_eip; old_eip = ctxt->_eip; rc = assign_eip_near(ctxt, ctxt->src.val); if (rc != X86EMUL_CONTINUE) return rc; ctxt->src.val = old_eip; rc = em_push(ctxt); return rc; } static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) { u64 old = ctxt->dst.orig_val64; if (ctxt->dst.bytes == 16) return X86EMUL_UNHANDLEABLE; if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) || ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) { *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0); *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32); ctxt->eflags &= ~X86_EFLAGS_ZF; } else { ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) | (u32) reg_read(ctxt, VCPU_REGS_RBX); ctxt->eflags |= X86_EFLAGS_ZF; } return X86EMUL_CONTINUE; } static int em_ret(struct x86_emulate_ctxt *ctxt) { int rc; unsigned long eip = 0; rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; return assign_eip_near(ctxt, eip); } static int em_ret_far(struct x86_emulate_ctxt *ctxt) { int rc; unsigned long eip = 0; unsigned long cs = 0; int cpl = ctxt->ops->cpl(ctxt); struct desc_struct new_desc; rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, X86_TRANSFER_RET, &new_desc); if (rc != X86EMUL_CONTINUE) return rc; rc = assign_eip_far(ctxt, eip); /* Error handling is not implemented. */ if (rc != X86EMUL_CONTINUE) return X86EMUL_UNHANDLEABLE; return rc; } static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt) { int rc; rc = em_ret_far(ctxt); if (rc != X86EMUL_CONTINUE) return rc; rsp_increment(ctxt, ctxt->src.val); return X86EMUL_CONTINUE; } static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) { /* Save real source value, then compare EAX against destination. */ ctxt->dst.orig_val = ctxt->dst.val; ctxt->dst.val = reg_read(ctxt, VCPU_REGS_RAX); ctxt->src.orig_val = ctxt->src.val; ctxt->src.val = ctxt->dst.orig_val; em_cmp(ctxt); if (ctxt->eflags & X86_EFLAGS_ZF) { /* Success: write back to memory; no update of EAX */ ctxt->src.type = OP_NONE; ctxt->dst.val = ctxt->src.orig_val; } else { /* Failure: write the value we saw to EAX. 
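 *
 * The architectural CMPXCHG behaviour that em_cmpxchg() reproduces, as a
 * free-standing sketch (names invented, operand size fixed at 64 bits for
 * brevity):
 *
 *	static void cmpxchg_sketch(uint64_t *dest, uint64_t *rax,
 *				   uint64_t src, int *zf)
 *	{
 *		if (*dest == *rax) {
 *			*zf = 1;	// success: destination takes the source
 *			*dest = src;
 *		} else {
 *			*zf = 0;	// failure: accumulator learns the old value
 *			*rax = *dest;
 *		}
 *	}
 *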
*/ ctxt->src.type = OP_REG; ctxt->src.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); ctxt->src.val = ctxt->dst.orig_val; /* Create write-cycle to dest by writing the same value */ ctxt->dst.val = ctxt->dst.orig_val; } return X86EMUL_CONTINUE; } static int em_lseg(struct x86_emulate_ctxt *ctxt) { int seg = ctxt->src2.val; unsigned short sel; int rc; memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); rc = load_segment_descriptor(ctxt, sel, seg); if (rc != X86EMUL_CONTINUE) return rc; ctxt->dst.val = ctxt->src.val; return rc; } static int em_rsm(struct x86_emulate_ctxt *ctxt) { if (!ctxt->ops->is_smm(ctxt)) return emulate_ud(ctxt); if (ctxt->ops->leave_smm(ctxt)) ctxt->ops->triple_fault(ctxt); return emulator_recalc_and_set_mode(ctxt); } static void setup_syscalls_segments(struct desc_struct *cs, struct desc_struct *ss) { cs->l = 0; /* will be adjusted later */ set_desc_base(cs, 0); /* flat segment */ cs->g = 1; /* 4kb granularity */ set_desc_limit(cs, 0xfffff); /* 4GB limit */ cs->type = 0x0b; /* Read, Execute, Accessed */ cs->s = 1; cs->dpl = 0; /* will be adjusted later */ cs->p = 1; cs->d = 1; cs->avl = 0; set_desc_base(ss, 0); /* flat segment */ set_desc_limit(ss, 0xfffff); /* 4GB limit */ ss->g = 1; /* 4kb granularity */ ss->s = 1; ss->type = 0x03; /* Read/Write, Accessed */ ss->d = 1; /* 32bit stack segment */ ss->dpl = 0; ss->p = 1; ss->l = 0; ss->avl = 0; } static int em_syscall(struct x86_emulate_ctxt *ctxt) { const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; u64 efer = 0; /* syscall is not available in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) return emulate_ud(ctxt); /* * Intel compatible CPUs only support SYSCALL in 64-bit mode, whereas * AMD allows SYSCALL in any flavor of protected mode. Note, it's * infeasible to emulate Intel behavior when running on AMD hardware, * as SYSCALL won't fault in the "wrong" mode, i.e. there is no #UD * for KVM to trap-and-emulate, unlike emulating AMD on Intel. */ if (ctxt->mode != X86EMUL_MODE_PROT64 && ctxt->ops->guest_cpuid_is_intel_compatible(ctxt)) return emulate_ud(ctxt); ops->get_msr(ctxt, MSR_EFER, &efer); if (!(efer & EFER_SCE)) return emulate_ud(ctxt); setup_syscalls_segments(&cs, &ss); ops->get_msr(ctxt, MSR_STAR, &msr_data); msr_data >>= 32; cs_sel = (u16)(msr_data & 0xfffc); ss_sel = (u16)(msr_data + 8); if (efer & EFER_LMA) { cs.d = 0; cs.l = 1; } ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip; if (efer & EFER_LMA) { #ifdef CONFIG_X86_64 *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags; ops->get_msr(ctxt, ctxt->mode == X86EMUL_MODE_PROT64 ? MSR_LSTAR : MSR_CSTAR, &msr_data); ctxt->_eip = msr_data; ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); ctxt->eflags &= ~msr_data; ctxt->eflags |= X86_EFLAGS_FIXED; #endif } else { /* legacy mode */ ops->get_msr(ctxt, MSR_STAR, &msr_data); ctxt->_eip = (u32)msr_data; ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); } ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; return X86EMUL_CONTINUE; } static int em_sysenter(struct x86_emulate_ctxt *ctxt) { const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; u64 efer = 0; ops->get_msr(ctxt, MSR_EFER, &efer); /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) return emulate_gp(ctxt, 0); /* * Intel's architecture allows SYSENTER in compatibility mode, but AMD * does not. 
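 *
 * How the SYSCALL target selectors fall out of MSR_STAR in em_syscall()
 * above, as an illustrative helper (not part of the emulator):
 *
 *	static void star_to_syscall_sel(uint64_t star, uint16_t *cs, uint16_t *ss)
 *	{
 *		uint16_t base = (uint16_t)(star >> 32);	// STAR[47:32]
 *
 *		*cs = base & 0xfffc;	// RPL forced to 0
 *		*ss = base + 8;		// SS sits right after CS in the GDT
 *	}
 *
 * The 64-bit entry point then comes from MSR_LSTAR (or MSR_CSTAR for a
 * compat-mode caller) and RFLAGS is masked with MSR_SYSCALL_MASK, as above.
 *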
Note, AMD does allow SYSENTER in legacy protected mode. */ if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA) && !ctxt->ops->guest_cpuid_is_intel_compatible(ctxt)) return emulate_ud(ctxt); /* sysenter/sysexit have not been tested in 64bit mode. */ if (ctxt->mode == X86EMUL_MODE_PROT64) return X86EMUL_UNHANDLEABLE; ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); if ((msr_data & 0xfffc) == 0x0) return emulate_gp(ctxt, 0); setup_syscalls_segments(&cs, &ss); ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK; ss_sel = cs_sel + 8; if (efer & EFER_LMA) { cs.d = 0; cs.l = 1; } ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); ctxt->_eip = (efer & EFER_LMA) ? msr_data : (u32)msr_data; ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); *reg_write(ctxt, VCPU_REGS_RSP) = (efer & EFER_LMA) ? msr_data : (u32)msr_data; if (efer & EFER_LMA) ctxt->mode = X86EMUL_MODE_PROT64; return X86EMUL_CONTINUE; } static int em_sysexit(struct x86_emulate_ctxt *ctxt) { const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data, rcx, rdx; int usermode; u16 cs_sel = 0, ss_sel = 0; /* inject #GP if in real mode or Virtual 8086 mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) return emulate_gp(ctxt, 0); setup_syscalls_segments(&cs, &ss); if ((ctxt->rex_prefix & 0x8) != 0x0) usermode = X86EMUL_MODE_PROT64; else usermode = X86EMUL_MODE_PROT32; rcx = reg_read(ctxt, VCPU_REGS_RCX); rdx = reg_read(ctxt, VCPU_REGS_RDX); cs.dpl = 3; ss.dpl = 3; ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); switch (usermode) { case X86EMUL_MODE_PROT32: cs_sel = (u16)(msr_data + 16); if ((msr_data & 0xfffc) == 0x0) return emulate_gp(ctxt, 0); ss_sel = (u16)(msr_data + 24); rcx = (u32)rcx; rdx = (u32)rdx; break; case X86EMUL_MODE_PROT64: cs_sel = (u16)(msr_data + 32); if (msr_data == 0x0) return emulate_gp(ctxt, 0); ss_sel = cs_sel + 8; cs.d = 0; cs.l = 1; if (emul_is_noncanonical_address(rcx, ctxt, 0) || emul_is_noncanonical_address(rdx, ctxt, 0)) return emulate_gp(ctxt, 0); break; } cs_sel |= SEGMENT_RPL_MASK; ss_sel |= SEGMENT_RPL_MASK; ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); ctxt->_eip = rdx; ctxt->mode = usermode; *reg_write(ctxt, VCPU_REGS_RSP) = rcx; return X86EMUL_CONTINUE; } static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) { int iopl; if (ctxt->mode == X86EMUL_MODE_REAL) return false; if (ctxt->mode == X86EMUL_MODE_VM86) return true; iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT; return ctxt->ops->cpl(ctxt) > iopl; } #define VMWARE_PORT_VMPORT (0x5658) #define VMWARE_PORT_VMRPC (0x5659) static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, u16 port, u16 len) { const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct tr_seg; u32 base3; int r; u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7; unsigned mask = (1 << len) - 1; unsigned long base; /* * VMware allows access to these ports even if denied * by TSS I/O permission bitmap. Mimic behavior. 
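 *
 * Unlike SYSCALL, the selectors used by SYSENTER/SYSEXIT above are all
 * derived from MSR_IA32_SYSENTER_CS; an illustrative (made-up) helper for
 * the SYSEXIT side:
 *
 *	static uint16_t sysexit_cs(uint16_t sysenter_cs, int to_64bit)
 *	{
 *		uint16_t cs = sysenter_cs + (to_64bit ? 32 : 16);
 *
 *		return cs | 3;		// returning to CPL 3
 *	}
 *
 * SYSENTER itself uses the MSR value with the RPL bits cleared for CS and
 * CS + 8 for SS, exactly as em_sysenter() does.
 *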
*/ if (enable_vmware_backdoor && ((port == VMWARE_PORT_VMPORT) || (port == VMWARE_PORT_VMRPC))) return true; ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR); if (!tr_seg.p) return false; if (desc_limit_scaled(&tr_seg) < 103) return false; base = get_desc_base(&tr_seg); #ifdef CONFIG_X86_64 base |= ((u64)base3) << 32; #endif r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL, true); if (r != X86EMUL_CONTINUE) return false; if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) return false; r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL, true); if (r != X86EMUL_CONTINUE) return false; if ((perm >> bit_idx) & mask) return false; return true; } static bool emulator_io_permitted(struct x86_emulate_ctxt *ctxt, u16 port, u16 len) { if (ctxt->perm_ok) return true; if (emulator_bad_iopl(ctxt)) if (!emulator_io_port_access_allowed(ctxt, port, len)) return false; ctxt->perm_ok = true; return true; } static void string_registers_quirk(struct x86_emulate_ctxt *ctxt) { /* * Intel CPUs mask the counter and pointers in quite strange * manner when ECX is zero due to REP-string optimizations. */ #ifdef CONFIG_X86_64 u32 eax, ebx, ecx, edx; if (ctxt->ad_bytes != 4) return; eax = ecx = 0; ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true); if (!is_guest_vendor_intel(ebx, ecx, edx)) return; *reg_write(ctxt, VCPU_REGS_RCX) = 0; switch (ctxt->b) { case 0xa4: /* movsb */ case 0xa5: /* movsd/w */ *reg_rmw(ctxt, VCPU_REGS_RSI) &= (u32)-1; fallthrough; case 0xaa: /* stosb */ case 0xab: /* stosd/w */ *reg_rmw(ctxt, VCPU_REGS_RDI) &= (u32)-1; } #endif } static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, struct tss_segment_16 *tss) { tss->ip = ctxt->_eip; tss->flag = ctxt->eflags; tss->ax = reg_read(ctxt, VCPU_REGS_RAX); tss->cx = reg_read(ctxt, VCPU_REGS_RCX); tss->dx = reg_read(ctxt, VCPU_REGS_RDX); tss->bx = reg_read(ctxt, VCPU_REGS_RBX); tss->sp = reg_read(ctxt, VCPU_REGS_RSP); tss->bp = reg_read(ctxt, VCPU_REGS_RBP); tss->si = reg_read(ctxt, VCPU_REGS_RSI); tss->di = reg_read(ctxt, VCPU_REGS_RDI); tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS); tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR); } static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, struct tss_segment_16 *tss) { int ret; u8 cpl; ctxt->_eip = tss->ip; ctxt->eflags = tss->flag | 2; *reg_write(ctxt, VCPU_REGS_RAX) = tss->ax; *reg_write(ctxt, VCPU_REGS_RCX) = tss->cx; *reg_write(ctxt, VCPU_REGS_RDX) = tss->dx; *reg_write(ctxt, VCPU_REGS_RBX) = tss->bx; *reg_write(ctxt, VCPU_REGS_RSP) = tss->sp; *reg_write(ctxt, VCPU_REGS_RBP) = tss->bp; *reg_write(ctxt, VCPU_REGS_RSI) = tss->si; *reg_write(ctxt, VCPU_REGS_RDI) = tss->di; /* * SDM says that segment selectors are loaded before segment * descriptors */ set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR); set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); cpl = tss->cs & 3; /* * Now load segment descriptors. 
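 *
 * The TSS I/O-bitmap test in emulator_io_port_access_allowed() above boils
 * down to the following bit arithmetic (free-standing sketch; perm_word is
 * the 16-bit value read at offset port / 8, two bytes so that a multi-byte
 * port range may straddle a byte boundary):
 *
 *	static int io_denied(uint16_t perm_word, uint16_t port, unsigned len)
 *	{
 *		unsigned bit_idx = port & 0x7;
 *		unsigned mask = (1u << len) - 1;
 *
 *		return (perm_word >> bit_idx) & mask;	// any set bit denies access
 *	}
 *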
If fault happens at this stage * it is handled in a context of new task */ ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; return X86EMUL_CONTINUE; } static int task_switch_16(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { struct tss_segment_16 tss_seg; int ret; u32 new_tss_base = get_desc_base(new_desc); ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof(tss_seg)); if (ret != X86EMUL_CONTINUE) return ret; save_state_to_tss16(ctxt, &tss_seg); ret = linear_write_system(ctxt, old_tss_base, &tss_seg, sizeof(tss_seg)); if (ret != X86EMUL_CONTINUE) return ret; ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof(tss_seg)); if (ret != X86EMUL_CONTINUE) return ret; if (old_tss_sel != 0xffff) { tss_seg.prev_task_link = old_tss_sel; ret = linear_write_system(ctxt, new_tss_base, &tss_seg.prev_task_link, sizeof(tss_seg.prev_task_link)); if (ret != X86EMUL_CONTINUE) return ret; } return load_state_from_tss16(ctxt, &tss_seg); } static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, struct tss_segment_32 *tss) { /* CR3 and ldt selector are not saved intentionally */ tss->eip = ctxt->_eip; tss->eflags = ctxt->eflags; tss->eax = reg_read(ctxt, VCPU_REGS_RAX); tss->ecx = reg_read(ctxt, VCPU_REGS_RCX); tss->edx = reg_read(ctxt, VCPU_REGS_RDX); tss->ebx = reg_read(ctxt, VCPU_REGS_RBX); tss->esp = reg_read(ctxt, VCPU_REGS_RSP); tss->ebp = reg_read(ctxt, VCPU_REGS_RBP); tss->esi = reg_read(ctxt, VCPU_REGS_RSI); tss->edi = reg_read(ctxt, VCPU_REGS_RDI); tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS); tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS); tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS); } static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, struct tss_segment_32 *tss) { int ret; u8 cpl; if (ctxt->ops->set_cr(ctxt, 3, tss->cr3)) return emulate_gp(ctxt, 0); ctxt->_eip = tss->eip; ctxt->eflags = tss->eflags | 2; /* General purpose registers */ *reg_write(ctxt, VCPU_REGS_RAX) = tss->eax; *reg_write(ctxt, VCPU_REGS_RCX) = tss->ecx; *reg_write(ctxt, VCPU_REGS_RDX) = tss->edx; *reg_write(ctxt, VCPU_REGS_RBX) = tss->ebx; *reg_write(ctxt, VCPU_REGS_RSP) = tss->esp; *reg_write(ctxt, VCPU_REGS_RBP) = tss->ebp; *reg_write(ctxt, VCPU_REGS_RSI) = tss->esi; *reg_write(ctxt, VCPU_REGS_RDI) = tss->edi; /* * SDM says that segment selectors are loaded before segment * descriptors. This is important because CPL checks will * use CS.RPL. 
*/ set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS); set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS); /* * If we're switching between Protected Mode and VM86, we need to make * sure to update the mode before loading the segment descriptors so * that the selectors are interpreted correctly. */ if (ctxt->eflags & X86_EFLAGS_VM) { ctxt->mode = X86EMUL_MODE_VM86; cpl = 3; } else { ctxt->mode = X86EMUL_MODE_PROT32; cpl = tss->cs & 3; } /* * Now load segment descriptors. If fault happens at this stage * it is handled in a context of new task */ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl, X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, X86_TRANSFER_TASK_SWITCH, NULL); return ret; } static int task_switch_32(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { struct tss_segment_32 tss_seg; int ret; u32 new_tss_base = get_desc_base(new_desc); u32 eip_offset = offsetof(struct tss_segment_32, eip); u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector); ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof(tss_seg)); if (ret != X86EMUL_CONTINUE) return ret; save_state_to_tss32(ctxt, &tss_seg); /* Only GP registers and segment selectors are saved */ ret = linear_write_system(ctxt, old_tss_base + eip_offset, &tss_seg.eip, ldt_sel_offset - eip_offset); if (ret != X86EMUL_CONTINUE) return ret; ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof(tss_seg)); if (ret != X86EMUL_CONTINUE) return ret; if (old_tss_sel != 0xffff) { tss_seg.prev_task_link = old_tss_sel; ret = linear_write_system(ctxt, new_tss_base, &tss_seg.prev_task_link, sizeof(tss_seg.prev_task_link)); if (ret != X86EMUL_CONTINUE) return ret; } return load_state_from_tss32(ctxt, &tss_seg); } static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct curr_tss_desc, next_tss_desc; int ret; u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); ulong old_tss_base = ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); u32 desc_limit; ulong desc_addr, dr7; /* FIXME: old_tss_base == ~0 ? 
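 *
 * The partial write-back in task_switch_32() above only touches the slice
 * of the old TSS that save_state_to_tss32() fills in; in outline:
 *
 *	size_t start = offsetof(struct tss_segment_32, eip);
 *	size_t end   = offsetof(struct tss_segment_32, ldt_selector);
 *
 *	// only bytes [start, end) - EIP, EFLAGS, the GPRs and the segment
 *	// selectors - are written back to old_tss_base + start; CR3 before
 *	// EIP and ldt_selector onwards are intentionally left untouched.
 *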
*/ ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) return ret; ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) return ret; /* FIXME: check that next_tss_desc is tss */ /* * Check privileges. The three cases are task switch caused by... * * 1. jmp/call/int to task gate: Check against DPL of the task gate * 2. Exception/IRQ/iret: No check is performed * 3. jmp/call to TSS/task-gate: No check is performed since the * hardware checks it before exiting. */ if (reason == TASK_SWITCH_GATE) { if (idt_index != -1) { /* Software interrupts */ struct desc_struct task_gate_desc; int dpl; ret = read_interrupt_descriptor(ctxt, idt_index, &task_gate_desc); if (ret != X86EMUL_CONTINUE) return ret; dpl = task_gate_desc.dpl; if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) return emulate_gp(ctxt, (idt_index << 3) | 0x2); } } desc_limit = desc_limit_scaled(&next_tss_desc); if (!next_tss_desc.p || ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || desc_limit < 0x2b)) { return emulate_ts(ctxt, tss_selector & 0xfffc); } if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */ write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); } if (reason == TASK_SWITCH_IRET) ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT; /* set back link to prev task only if NT bit is set in eflags note that old_tss_sel is not used after this point */ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) old_tss_sel = 0xffff; if (next_tss_desc.type & 8) ret = task_switch_32(ctxt, old_tss_sel, old_tss_base, &next_tss_desc); else ret = task_switch_16(ctxt, old_tss_sel, old_tss_base, &next_tss_desc); if (ret != X86EMUL_CONTINUE) return ret; if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT; if (reason != TASK_SWITCH_IRET) { next_tss_desc.type |= (1 << 1); /* set busy flag */ write_segment_descriptor(ctxt, tss_selector, &next_tss_desc); } ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR); if (has_error_code) { ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; ctxt->lock_prefix = 0; ctxt->src.val = (unsigned long) error_code; ret = em_push(ctxt); } dr7 = ops->get_dr(ctxt, 7); ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN)); return ret; } int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { int rc; invalidate_registers(ctxt); ctxt->_eip = ctxt->eip; ctxt->dst.type = OP_NONE; rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); if (rc == X86EMUL_CONTINUE) { ctxt->eip = ctxt->_eip; writeback_registers(ctxt); } return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg, struct operand *op) { int df = (ctxt->eflags & X86_EFLAGS_DF) ? 
-op->count : op->count; register_address_increment(ctxt, reg, df * op->bytes); op->addr.mem.ea = register_address(ctxt, reg); } static int em_das(struct x86_emulate_ctxt *ctxt) { u8 al, old_al; bool af, cf, old_cf; cf = ctxt->eflags & X86_EFLAGS_CF; al = ctxt->dst.val; old_al = al; old_cf = cf; cf = false; af = ctxt->eflags & X86_EFLAGS_AF; if ((al & 0x0f) > 9 || af) { al -= 6; cf = old_cf | (al >= 250); af = true; } else { af = false; } if (old_al > 0x99 || old_cf) { al -= 0x60; cf = true; } ctxt->dst.val = al; /* Set PF, ZF, SF */ ctxt->src.type = OP_IMM; ctxt->src.val = 0; ctxt->src.bytes = 1; em_or(ctxt); ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); if (cf) ctxt->eflags |= X86_EFLAGS_CF; if (af) ctxt->eflags |= X86_EFLAGS_AF; return X86EMUL_CONTINUE; } static int em_aam(struct x86_emulate_ctxt *ctxt) { u8 al, ah; if (ctxt->src.val == 0) return emulate_de(ctxt); al = ctxt->dst.val & 0xff; ah = al / ctxt->src.val; al %= ctxt->src.val; ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al | (ah << 8); /* Set PF, ZF, SF */ ctxt->src.type = OP_IMM; ctxt->src.val = 0; ctxt->src.bytes = 1; em_or(ctxt); return X86EMUL_CONTINUE; } static int em_aad(struct x86_emulate_ctxt *ctxt) { u8 al = ctxt->dst.val & 0xff; u8 ah = (ctxt->dst.val >> 8) & 0xff; al = (al + (ah * ctxt->src.val)) & 0xff; ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al; /* Set PF, ZF, SF */ ctxt->src.type = OP_IMM; ctxt->src.val = 0; ctxt->src.bytes = 1; em_or(ctxt); return X86EMUL_CONTINUE; } static int em_call(struct x86_emulate_ctxt *ctxt) { int rc; long rel = ctxt->src.val; ctxt->src.val = (unsigned long)ctxt->_eip; rc = jmp_rel(ctxt, rel); if (rc != X86EMUL_CONTINUE) return rc; return em_push(ctxt); } static int em_call_far(struct x86_emulate_ctxt *ctxt) { u16 sel, old_cs; ulong old_eip; int rc; struct desc_struct old_desc, new_desc; const struct x86_emulate_ops *ops = ctxt->ops; int cpl = ctxt->ops->cpl(ctxt); enum x86emul_mode prev_mode = ctxt->mode; old_eip = ctxt->_eip; ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS); memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, X86_TRANSFER_CALL_JMP, &new_desc); if (rc != X86EMUL_CONTINUE) return rc; rc = assign_eip_far(ctxt, ctxt->src.val); if (rc != X86EMUL_CONTINUE) goto fail; ctxt->src.val = old_cs; rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) goto fail; ctxt->src.val = old_eip; rc = em_push(ctxt); /* If we failed, we tainted the memory, but the very least we should restore cs */ if (rc != X86EMUL_CONTINUE) { pr_warn_once("faulting far call emulation tainted memory\n"); goto fail; } return rc; fail: ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); ctxt->mode = prev_mode; return rc; } static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) { int rc; unsigned long eip = 0; rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; rc = assign_eip_near(ctxt, eip); if (rc != X86EMUL_CONTINUE) return rc; rsp_increment(ctxt, ctxt->src.val); return X86EMUL_CONTINUE; } static int em_xchg(struct x86_emulate_ctxt *ctxt) { /* Write back the register source. */ ctxt->src.val = ctxt->dst.val; write_register_operand(&ctxt->src); /* Write back the memory destination with implicit LOCK prefix. 
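 *
 * AAM and AAD above are just base-imm8 splits and joins of AL/AH; a
 * free-standing sketch (the divide-by-zero case is omitted because
 * em_aam() already raises #DE for it):
 *
 *	static void aam_sketch(uint8_t *al, uint8_t *ah, uint8_t base)
 *	{
 *		*ah = *al / base;
 *		*al = *al % base;
 *	}
 *
 *	static void aad_sketch(uint8_t *al, uint8_t *ah, uint8_t base)
 *	{
 *		*al = (uint8_t)(*al + *ah * base);
 *		*ah = 0;
 *	}
 *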
*/ ctxt->dst.val = ctxt->src.orig_val; ctxt->lock_prefix = 1; return X86EMUL_CONTINUE; } static int em_imul_3op(struct x86_emulate_ctxt *ctxt) { ctxt->dst.val = ctxt->src2.val; return em_imul(ctxt); } static int em_cwd(struct x86_emulate_ctxt *ctxt) { ctxt->dst.type = OP_REG; ctxt->dst.bytes = ctxt->src.bytes; ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1); return X86EMUL_CONTINUE; } static int em_rdpid(struct x86_emulate_ctxt *ctxt) { u64 tsc_aux = 0; if (!ctxt->ops->guest_has_rdpid(ctxt)) return emulate_ud(ctxt); ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux); ctxt->dst.val = tsc_aux; return X86EMUL_CONTINUE; } static int em_rdtsc(struct x86_emulate_ctxt *ctxt) { u64 tsc = 0; ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); *reg_write(ctxt, VCPU_REGS_RAX) = (u32)tsc; *reg_write(ctxt, VCPU_REGS_RDX) = tsc >> 32; return X86EMUL_CONTINUE; } static int em_rdpmc(struct x86_emulate_ctxt *ctxt) { u64 pmc; if (ctxt->ops->read_pmc(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &pmc)) return emulate_gp(ctxt, 0); *reg_write(ctxt, VCPU_REGS_RAX) = (u32)pmc; *reg_write(ctxt, VCPU_REGS_RDX) = pmc >> 32; return X86EMUL_CONTINUE; } static int em_mov(struct x86_emulate_ctxt *ctxt) { memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr)); return X86EMUL_CONTINUE; } static int em_movbe(struct x86_emulate_ctxt *ctxt) { u16 tmp; if (!ctxt->ops->guest_has_movbe(ctxt)) return emulate_ud(ctxt); switch (ctxt->op_bytes) { case 2: /* * From MOVBE definition: "...When the operand size is 16 bits, * the upper word of the destination register remains unchanged * ..." * * Both casting ->valptr and ->val to u16 breaks strict aliasing * rules so we have to do the operation almost per hand. */ tmp = (u16)ctxt->src.val; ctxt->dst.val &= ~0xffffUL; ctxt->dst.val |= (unsigned long)swab16(tmp); break; case 4: ctxt->dst.val = swab32((u32)ctxt->src.val); break; case 8: ctxt->dst.val = swab64(ctxt->src.val); break; default: BUG(); } return X86EMUL_CONTINUE; } static int em_cr_write(struct x86_emulate_ctxt *ctxt) { int cr_num = ctxt->modrm_reg; int r; if (ctxt->ops->set_cr(ctxt, cr_num, ctxt->src.val)) return emulate_gp(ctxt, 0); /* Disable writeback. */ ctxt->dst.type = OP_NONE; if (cr_num == 0) { /* * CR0 write might have updated CR0.PE and/or CR0.PG * which can affect the cpu's execution mode. */ r = emulator_recalc_and_set_mode(ctxt); if (r != X86EMUL_CONTINUE) return r; } return X86EMUL_CONTINUE; } static int em_dr_write(struct x86_emulate_ctxt *ctxt) { unsigned long val; if (ctxt->mode == X86EMUL_MODE_PROT64) val = ctxt->src.val & ~0ULL; else val = ctxt->src.val & ~0U; /* #UD condition is already handled. */ if (ctxt->ops->set_dr(ctxt, ctxt->modrm_reg, val) < 0) return emulate_gp(ctxt, 0); /* Disable writeback. 
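 *
 * The em_cwd() expression above is a branch-free spill of the accumulator's
 * sign bit into DX/EDX/RDX; written out (illustrative helper only):
 *
 *	static uint64_t cwd_sketch(uint64_t acc, unsigned bytes)
 *	{
 *		uint64_t sign = (acc >> (bytes * 8 - 1)) & 1;
 *
 *		return ~(sign - 1);	// all ones if negative, 0 otherwise
 *	}
 *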
*/ ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } static int em_wrmsr(struct x86_emulate_ctxt *ctxt) { u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX); u64 msr_data; int r; msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX) | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32); r = ctxt->ops->set_msr_with_filter(ctxt, msr_index, msr_data); if (r == X86EMUL_PROPAGATE_FAULT) return emulate_gp(ctxt, 0); return r; } static int em_rdmsr(struct x86_emulate_ctxt *ctxt) { u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX); u64 msr_data; int r; r = ctxt->ops->get_msr_with_filter(ctxt, msr_index, &msr_data); if (r == X86EMUL_PROPAGATE_FAULT) return emulate_gp(ctxt, 0); if (r == X86EMUL_CONTINUE) { *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data; *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32; } return r; } static int em_store_sreg(struct x86_emulate_ctxt *ctxt, int segment) { if (segment > VCPU_SREG_GS && (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) && ctxt->ops->cpl(ctxt) > 0) return emulate_gp(ctxt, 0); ctxt->dst.val = get_segment_selector(ctxt, segment); if (ctxt->dst.bytes == 4 && ctxt->dst.type == OP_MEM) ctxt->dst.bytes = 2; return X86EMUL_CONTINUE; } static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) { if (ctxt->modrm_reg > VCPU_SREG_GS) return emulate_ud(ctxt); return em_store_sreg(ctxt, ctxt->modrm_reg); } static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) { u16 sel = ctxt->src.val; if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS) return emulate_ud(ctxt); if (ctxt->modrm_reg == VCPU_SREG_SS) ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; /* Disable writeback. */ ctxt->dst.type = OP_NONE; return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); } static int em_sldt(struct x86_emulate_ctxt *ctxt) { return em_store_sreg(ctxt, VCPU_SREG_LDTR); } static int em_lldt(struct x86_emulate_ctxt *ctxt) { u16 sel = ctxt->src.val; /* Disable writeback. */ ctxt->dst.type = OP_NONE; return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR); } static int em_str(struct x86_emulate_ctxt *ctxt) { return em_store_sreg(ctxt, VCPU_SREG_TR); } static int em_ltr(struct x86_emulate_ctxt *ctxt) { u16 sel = ctxt->src.val; /* Disable writeback. */ ctxt->dst.type = OP_NONE; return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR); } static int em_invlpg(struct x86_emulate_ctxt *ctxt) { int rc; ulong linear; unsigned int max_size; rc = __linearize(ctxt, ctxt->src.addr.mem, &max_size, 1, ctxt->mode, &linear, X86EMUL_F_INVLPG); if (rc == X86EMUL_CONTINUE) ctxt->ops->invlpg(ctxt, linear); /* Disable writeback. */ ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } static int em_clts(struct x86_emulate_ctxt *ctxt) { ulong cr0; cr0 = ctxt->ops->get_cr(ctxt, 0); cr0 &= ~X86_CR0_TS; ctxt->ops->set_cr(ctxt, 0, cr0); return X86EMUL_CONTINUE; } static int em_hypercall(struct x86_emulate_ctxt *ctxt) { int rc = ctxt->ops->fix_hypercall(ctxt); if (rc != X86EMUL_CONTINUE) return rc; /* Let the processor re-execute the fixed hypercall */ ctxt->_eip = ctxt->eip; /* Disable writeback. */ ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt, void (*get)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *ptr)) { struct desc_ptr desc_ptr; if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) && ctxt->ops->cpl(ctxt) > 0) return emulate_gp(ctxt, 0); if (ctxt->mode == X86EMUL_MODE_PROT64) ctxt->op_bytes = 8; get(ctxt, &desc_ptr); if (ctxt->op_bytes == 2) { ctxt->op_bytes = 4; desc_ptr.address &= 0x00ffffff; } /* Disable writeback. 
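 *
 * WRMSR and RDMSR above move a 64-bit value through the EDX:EAX register
 * pair; the packing used by em_wrmsr()/em_rdmsr() is simply (free-standing
 * illustration):
 *
 *	static uint64_t edx_eax_to_u64(uint32_t eax, uint32_t edx)
 *	{
 *		return (uint64_t)edx << 32 | eax;
 *	}
 *
 *	static void u64_to_edx_eax(uint64_t val, uint32_t *eax, uint32_t *edx)
 *	{
 *		*eax = (uint32_t)val;
 *		*edx = (uint32_t)(val >> 32);
 *	}
 *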
*/ ctxt->dst.type = OP_NONE; return segmented_write_std(ctxt, ctxt->dst.addr.mem, &desc_ptr, 2 + ctxt->op_bytes); } static int em_sgdt(struct x86_emulate_ctxt *ctxt) { return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt); } static int em_sidt(struct x86_emulate_ctxt *ctxt) { return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt); } static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt) { struct desc_ptr desc_ptr; int rc; if (ctxt->mode == X86EMUL_MODE_PROT64) ctxt->op_bytes = 8; rc = read_descriptor(ctxt, ctxt->src.addr.mem, &desc_ptr.size, &desc_ptr.address, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; if (ctxt->mode == X86EMUL_MODE_PROT64 && emul_is_noncanonical_address(desc_ptr.address, ctxt, X86EMUL_F_DT_LOAD)) return emulate_gp(ctxt, 0); if (lgdt) ctxt->ops->set_gdt(ctxt, &desc_ptr); else ctxt->ops->set_idt(ctxt, &desc_ptr); /* Disable writeback. */ ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } static int em_lgdt(struct x86_emulate_ctxt *ctxt) { return em_lgdt_lidt(ctxt, true); } static int em_lidt(struct x86_emulate_ctxt *ctxt) { return em_lgdt_lidt(ctxt, false); } static int em_smsw(struct x86_emulate_ctxt *ctxt) { if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) && ctxt->ops->cpl(ctxt) > 0) return emulate_gp(ctxt, 0); if (ctxt->dst.type == OP_MEM) ctxt->dst.bytes = 2; ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0); return X86EMUL_CONTINUE; } static int em_lmsw(struct x86_emulate_ctxt *ctxt) { ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul) | (ctxt->src.val & 0x0f)); ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } static int em_loop(struct x86_emulate_ctxt *ctxt) { int rc = X86EMUL_CONTINUE; register_address_increment(ctxt, VCPU_REGS_RCX, -1); if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) && (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) rc = jmp_rel(ctxt, ctxt->src.val); return rc; } static int em_jcxz(struct x86_emulate_ctxt *ctxt) { int rc = X86EMUL_CONTINUE; if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) rc = jmp_rel(ctxt, ctxt->src.val); return rc; } static int em_in(struct x86_emulate_ctxt *ctxt) { if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val, &ctxt->dst.val)) return X86EMUL_IO_NEEDED; return X86EMUL_CONTINUE; } static int em_out(struct x86_emulate_ctxt *ctxt) { ctxt->ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val, &ctxt->src.val, 1); /* Disable writeback. 
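 *
 * The SGDT/SIDT store above always writes a 16-bit limit followed by the
 * base, i.e. 2 + op_bytes bytes in total; a rough picture of the size
 * handling in emulate_store_desc_ptr() (illustrative helper):
 *
 *	static unsigned sgdt_store_size(unsigned op_bytes)
 *	{
 *		if (op_bytes == 2)	// 16-bit operand size still stores 6 bytes,
 *			op_bytes = 4;	// with the base truncated to 24 bits
 *		return 2 + op_bytes;	// 6 bytes in 16/32-bit mode, 10 in 64-bit
 *	}
 *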
*/ ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } static int em_cli(struct x86_emulate_ctxt *ctxt) { if (emulator_bad_iopl(ctxt)) return emulate_gp(ctxt, 0); ctxt->eflags &= ~X86_EFLAGS_IF; return X86EMUL_CONTINUE; } static int em_sti(struct x86_emulate_ctxt *ctxt) { if (emulator_bad_iopl(ctxt)) return emulate_gp(ctxt, 0); ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; ctxt->eflags |= X86_EFLAGS_IF; return X86EMUL_CONTINUE; } static int em_cpuid(struct x86_emulate_ctxt *ctxt) { u32 eax, ebx, ecx, edx; u64 msr = 0; ctxt->ops->get_msr(ctxt, MSR_MISC_FEATURES_ENABLES, &msr); if (msr & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && ctxt->ops->cpl(ctxt)) { return emulate_gp(ctxt, 0); } eax = reg_read(ctxt, VCPU_REGS_RAX); ecx = reg_read(ctxt, VCPU_REGS_RCX); ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); *reg_write(ctxt, VCPU_REGS_RAX) = eax; *reg_write(ctxt, VCPU_REGS_RBX) = ebx; *reg_write(ctxt, VCPU_REGS_RCX) = ecx; *reg_write(ctxt, VCPU_REGS_RDX) = edx; return X86EMUL_CONTINUE; } static int em_sahf(struct x86_emulate_ctxt *ctxt) { u32 flags; flags = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | X86_EFLAGS_SF; flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8; ctxt->eflags &= ~0xffUL; ctxt->eflags |= flags | X86_EFLAGS_FIXED; return X86EMUL_CONTINUE; } static int em_lahf(struct x86_emulate_ctxt *ctxt) { *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL; *reg_rmw(ctxt, VCPU_REGS_RAX) |= (ctxt->eflags & 0xff) << 8; return X86EMUL_CONTINUE; } static int em_bswap(struct x86_emulate_ctxt *ctxt) { switch (ctxt->op_bytes) { #ifdef CONFIG_X86_64 case 8: asm("bswap %0" : "+r"(ctxt->dst.val)); break; #endif default: asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val)); break; } return X86EMUL_CONTINUE; } static int em_clflush(struct x86_emulate_ctxt *ctxt) { /* emulating clflush regardless of cpuid */ return X86EMUL_CONTINUE; } static int em_clflushopt(struct x86_emulate_ctxt *ctxt) { /* emulating clflushopt regardless of cpuid */ return X86EMUL_CONTINUE; } static int em_movsxd(struct x86_emulate_ctxt *ctxt) { ctxt->dst.val = (s32) ctxt->src.val; return X86EMUL_CONTINUE; } static int check_fxsr(struct x86_emulate_ctxt *ctxt) { if (!ctxt->ops->guest_has_fxsr(ctxt)) return emulate_ud(ctxt); if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); /* * Don't emulate a case that should never be hit, instead of working * around a lack of fxsave64/fxrstor64 on old compilers. */ if (ctxt->mode >= X86EMUL_MODE_PROT64) return X86EMUL_UNHANDLEABLE; return X86EMUL_CONTINUE; } /* * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save * and restore MXCSR. */ static size_t __fxstate_size(int nregs) { return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16; } static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt) { bool cr4_osfxsr; if (ctxt->mode == X86EMUL_MODE_PROT64) return __fxstate_size(16); cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR; return __fxstate_size(cr4_osfxsr ? 8 : 0); } /* * FXSAVE and FXRSTOR have 4 different formats depending on execution mode, * 1) 16 bit mode * 2) 32 bit mode * - like (1), but FIP and FDP (foo) are only 16 bit. At least Intel CPUs * preserve whole 32 bit values, though, so (1) and (2) are the same wrt. * save and restore * 3) 64-bit mode with REX.W prefix * - like (2), but XMM 8-15 are being saved and restored * 4) 64-bit mode without REX.W prefix * - like (3), but FIP and FDP are 64 bit * * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the * desired result. 
(4) is not emulated. * * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS * and FPU DS) should match. */ static int em_fxsave(struct x86_emulate_ctxt *ctxt) { struct fxregs_state fx_state; int rc; rc = check_fxsr(ctxt); if (rc != X86EMUL_CONTINUE) return rc; kvm_fpu_get(); rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); kvm_fpu_put(); if (rc != X86EMUL_CONTINUE) return rc; return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, fxstate_size(ctxt)); } /* * FXRSTOR might restore XMM registers not provided by the guest. Fill * in the host registers (via FXSAVE) instead, so they won't be modified. * (preemption has to stay disabled until FXRSTOR). * * Use noinline to keep the stack for other functions called by callers small. */ static noinline int fxregs_fixup(struct fxregs_state *fx_state, const size_t used_size) { struct fxregs_state fx_tmp; int rc; rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_tmp)); memcpy((void *)fx_state + used_size, (void *)&fx_tmp + used_size, __fxstate_size(16) - used_size); return rc; } static int em_fxrstor(struct x86_emulate_ctxt *ctxt) { struct fxregs_state fx_state; int rc; size_t size; rc = check_fxsr(ctxt); if (rc != X86EMUL_CONTINUE) return rc; size = fxstate_size(ctxt); rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); if (rc != X86EMUL_CONTINUE) return rc; kvm_fpu_get(); if (size < __fxstate_size(16)) { rc = fxregs_fixup(&fx_state, size); if (rc != X86EMUL_CONTINUE) goto out; } if (fx_state.mxcsr >> 16) { rc = emulate_gp(ctxt, 0); goto out; } if (rc == X86EMUL_CONTINUE) rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); out: kvm_fpu_put(); return rc; } static int em_xsetbv(struct x86_emulate_ctxt *ctxt) { u32 eax, ecx, edx; if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE)) return emulate_ud(ctxt); eax = reg_read(ctxt, VCPU_REGS_RAX); edx = reg_read(ctxt, VCPU_REGS_RDX); ecx = reg_read(ctxt, VCPU_REGS_RCX); if (ctxt->ops->set_xcr(ctxt, ecx, ((u64)edx << 32) | eax)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; } static bool valid_cr(int nr) { switch (nr) { case 0: case 2 ... 4: case 8: return true; default: return false; } } static int check_cr_access(struct x86_emulate_ctxt *ctxt) { if (!valid_cr(ctxt->modrm_reg)) return emulate_ud(ctxt); return X86EMUL_CONTINUE; } static int check_dr_read(struct x86_emulate_ctxt *ctxt) { int dr = ctxt->modrm_reg; u64 cr4; if (dr > 7) return emulate_ud(ctxt); cr4 = ctxt->ops->get_cr(ctxt, 4); if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5)) return emulate_ud(ctxt); if (ctxt->ops->get_dr(ctxt, 7) & DR7_GD) { ulong dr6; dr6 = ctxt->ops->get_dr(ctxt, 6); dr6 &= ~DR_TRAP_BITS; dr6 |= DR6_BD | DR6_ACTIVE_LOW; ctxt->ops->set_dr(ctxt, 6, dr6); return emulate_db(ctxt); } return X86EMUL_CONTINUE; } static int check_dr_write(struct x86_emulate_ctxt *ctxt) { u64 new_val = ctxt->src.val64; int dr = ctxt->modrm_reg; if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL)) return emulate_gp(ctxt, 0); return check_dr_read(ctxt); } static int check_svme(struct x86_emulate_ctxt *ctxt) { u64 efer = 0; ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if (!(efer & EFER_SVME)) return emulate_ud(ctxt); return X86EMUL_CONTINUE; } static int check_svme_pa(struct x86_emulate_ctxt *ctxt) { u64 rax = reg_read(ctxt, VCPU_REGS_RAX); /* Valid physical address? 
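 *
 * The debug-register checks above, condensed into one illustrative helper
 * (returns 0 if the access is allowed, otherwise the vector that would be
 * raised; the DR7.GD #DB case is left out for brevity):
 *
 *	static int dr_access_fault(int dr, uint64_t cr4, uint64_t new_val,
 *				   int is_write)
 *	{
 *		if (dr > 7)
 *			return UD_VECTOR;
 *		if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
 *			return UD_VECTOR;	// DR4/5 alias DR6/7 only with CR4.DE clear
 *		if (is_write && (dr == 6 || dr == 7) && (new_val >> 32))
 *			return GP_VECTOR;	// upper 32 bits are reserved
 *		return 0;
 *	}
 *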
*/ if (rax & 0xffff000000000000ULL) return emulate_gp(ctxt, 0); return check_svme(ctxt); } static int check_rdtsc(struct x86_emulate_ctxt *ctxt) { u64 cr4 = ctxt->ops->get_cr(ctxt, 4); if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; } static int check_rdpmc(struct x86_emulate_ctxt *ctxt) { u64 cr4 = ctxt->ops->get_cr(ctxt, 4); u64 rcx = reg_read(ctxt, VCPU_REGS_RCX); /* * VMware allows access to these Pseduo-PMCs even when read via RDPMC * in Ring3 when CR4.PCE=0. */ if (enable_vmware_backdoor && is_vmware_backdoor_pmc(rcx)) return X86EMUL_CONTINUE; /* * If CR4.PCE is set, the SDM requires CPL=0 or CR0.PE=0. The CR0.PE * check however is unnecessary because CPL is always 0 outside * protected mode. */ if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || ctxt->ops->check_rdpmc_early(ctxt, rcx)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; } static int check_perm_in(struct x86_emulate_ctxt *ctxt) { ctxt->dst.bytes = min(ctxt->dst.bytes, 4u); if (!emulator_io_permitted(ctxt, ctxt->src.val, ctxt->dst.bytes)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; } static int check_perm_out(struct x86_emulate_ctxt *ctxt) { ctxt->src.bytes = min(ctxt->src.bytes, 4u); if (!emulator_io_permitted(ctxt, ctxt->dst.val, ctxt->src.bytes)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; } #define D(_y) { .flags = (_y) } #define DI(_y, _i) { .flags = (_y)|Intercept, .intercept = x86_intercept_##_i } #define DIP(_y, _i, _p) { .flags = (_y)|Intercept|CheckPerm, \ .intercept = x86_intercept_##_i, .check_perm = (_p) } #define N D(NotImpl) #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } #define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) } #define MD(_f, _m) { .flags = ((_f) | ModeDual), .u.mdual = (_m) } #define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } #define II(_f, _e, _i) \ { .flags = (_f)|Intercept, .u.execute = (_e), .intercept = x86_intercept_##_i } #define IIP(_f, _e, _i, _p) \ { .flags = (_f)|Intercept|CheckPerm, .u.execute = (_e), \ .intercept = x86_intercept_##_i, .check_perm = (_p) } #define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) } #define D2bv(_f) D((_f) | ByteOp), D(_f) #define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) #define F2bv(_f, _e) F((_f) | ByteOp, _e), F(_f, _e) #define I2bvIP(_f, _e, _i, _p) \ IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p) #define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) static const struct opcode group7_rm0[] = { N, I(SrcNone | Priv | EmulateOnUD, em_hypercall), N, N, N, N, N, N, }; static const struct opcode group7_rm1[] = { DI(SrcNone | Priv, monitor), DI(SrcNone | Priv, mwait), N, N, N, N, N, N, }; static const struct opcode group7_rm2[] = { N, II(ImplicitOps | Priv, em_xsetbv, xsetbv), N, N, N, N, N, N, }; static const struct opcode group7_rm3[] = { DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), II(SrcNone | Prot | EmulateOnUD, em_hypercall, vmmcall), DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa), DIP(SrcNone | Prot | Priv, stgi, check_svme), DIP(SrcNone | Prot | Priv, clgi, 
check_svme), DIP(SrcNone | Prot | Priv, skinit, check_svme), DIP(SrcNone | Prot | Priv, invlpga, check_svme), }; static const struct opcode group7_rm7[] = { N, DIP(SrcNone, rdtscp, check_rdtsc), N, N, N, N, N, N, }; static const struct opcode group1[] = { I(Lock, em_add), I(Lock | PageTable, em_or), I(Lock, em_adc), I(Lock, em_sbb), I(Lock | PageTable, em_and), I(Lock, em_sub), I(Lock, em_xor), I(NoWrite, em_cmp), }; static const struct opcode group1A[] = { I(DstMem | SrcNone | Mov | Stack | IncSP | TwoMemOp, em_pop), N, N, N, N, N, N, N, }; static const struct opcode group2[] = { I(DstMem | ModRM, em_rol), I(DstMem | ModRM, em_ror), I(DstMem | ModRM, em_rcl), I(DstMem | ModRM, em_rcr), I(DstMem | ModRM, em_shl), I(DstMem | ModRM, em_shr), I(DstMem | ModRM, em_shl), I(DstMem | ModRM, em_sar), }; static const struct opcode group3[] = { I(DstMem | SrcImm | NoWrite, em_test), I(DstMem | SrcImm | NoWrite, em_test), I(DstMem | SrcNone | Lock, em_not), I(DstMem | SrcNone | Lock, em_neg), I(DstXacc | Src2Mem, em_mul_ex), I(DstXacc | Src2Mem, em_imul_ex), I(DstXacc | Src2Mem, em_div_ex), I(DstXacc | Src2Mem, em_idiv_ex), }; static const struct opcode group4[] = { I(ByteOp | DstMem | SrcNone | Lock, em_inc), I(ByteOp | DstMem | SrcNone | Lock, em_dec), N, N, N, N, N, N, }; static const struct opcode group5[] = { I(DstMem | SrcNone | Lock, em_inc), I(DstMem | SrcNone | Lock, em_dec), I(SrcMem | NearBranch | IsBranch | ShadowStack, em_call_near_abs), I(SrcMemFAddr | ImplicitOps | IsBranch | ShadowStack, em_call_far), I(SrcMem | NearBranch | IsBranch, em_jmp_abs), I(SrcMemFAddr | ImplicitOps | IsBranch, em_jmp_far), I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined), }; static const struct opcode group6[] = { II(Prot | DstMem, em_sldt, sldt), II(Prot | DstMem, em_str, str), II(Prot | Priv | SrcMem16, em_lldt, lldt), II(Prot | Priv | SrcMem16, em_ltr, ltr), N, N, N, N, }; static const struct group_dual group7 = { { II(Mov | DstMem, em_sgdt, sgdt), II(Mov | DstMem, em_sidt, sidt), II(SrcMem | Priv, em_lgdt, lgdt), II(SrcMem | Priv, em_lidt, lidt), II(SrcNone | DstMem | Mov, em_smsw, smsw), N, II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg), }, { EXT(0, group7_rm0), EXT(0, group7_rm1), EXT(0, group7_rm2), EXT(0, group7_rm3), II(SrcNone | DstMem | Mov, em_smsw, smsw), N, II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7), } }; static const struct opcode group8[] = { N, N, N, N, I(DstMem | SrcImmByte | NoWrite, em_bt), I(DstMem | SrcImmByte | Lock | PageTable, em_bts), I(DstMem | SrcImmByte | Lock, em_btr), I(DstMem | SrcImmByte | Lock | PageTable, em_btc), }; /* * The "memory" destination is actually always a register, since we come * from the register case of group9. 
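 *
 * For reference, a single I6ALU() entry in the opcode table below covers the
 * six classic encodings of an ALU instruction; e.g. I6ALU(Lock, em_add)
 * expands (via I2bv) to:
 *
 *	I(Lock | DstMem | SrcReg | ModRM | ByteOp, em_add)   // 0x00 add r/m8, r8
 *	I(Lock | DstMem | SrcReg | ModRM,          em_add)   // 0x01 add r/m, r
 *	I(DstReg | SrcMem | ModRM | ByteOp,        em_add)   // 0x02 add r8, r/m8
 *	I(DstReg | SrcMem | ModRM,                 em_add)   // 0x03 add r, r/m
 *	I(DstAcc | SrcImm | ByteOp,                em_add)   // 0x04 add al, imm8
 *	I(DstAcc | SrcImm,                         em_add)   // 0x05 add eax, imm
 *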
*/ static const struct gprefix pfx_0f_c7_7 = { N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdpid), }; static const struct group_dual group9 = { { N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, }, { N, N, N, N, N, N, N, GP(0, &pfx_0f_c7_7), } }; static const struct opcode group11[] = { I(DstMem | SrcImm | Mov | PageTable, em_mov), X7(D(Undefined)), }; static const struct gprefix pfx_0f_ae_7 = { I(SrcMem | ByteOp, em_clflush), I(SrcMem | ByteOp, em_clflushopt), N, N, }; static const struct group_dual group15 = { { I(ModRM | Aligned16, em_fxsave), I(ModRM | Aligned16, em_fxrstor), N, N, N, N, N, GP(0, &pfx_0f_ae_7), }, { N, N, N, N, N, N, N, N, } }; static const struct gprefix pfx_0f_6f_0f_7f = { I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), }; static const struct instr_dual instr_dual_0f_2b = { I(0, em_mov), N }; static const struct gprefix pfx_0f_2b = { ID(0, &instr_dual_0f_2b), ID(0, &instr_dual_0f_2b), N, N, }; static const struct gprefix pfx_0f_10_0f_11 = { I(Unaligned, em_mov), I(Unaligned, em_mov), N, N, }; static const struct gprefix pfx_0f_28_0f_29 = { I(Aligned, em_mov), I(Aligned, em_mov), N, N, }; static const struct gprefix pfx_0f_e7 = { N, I(Sse, em_mov), N, N, }; static const struct escape escape_d9 = { { N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw), }, { /* 0xC0 - 0xC7 */ N, N, N, N, N, N, N, N, /* 0xC8 - 0xCF */ N, N, N, N, N, N, N, N, /* 0xD0 - 0xC7 */ N, N, N, N, N, N, N, N, /* 0xD8 - 0xDF */ N, N, N, N, N, N, N, N, /* 0xE0 - 0xE7 */ N, N, N, N, N, N, N, N, /* 0xE8 - 0xEF */ N, N, N, N, N, N, N, N, /* 0xF0 - 0xF7 */ N, N, N, N, N, N, N, N, /* 0xF8 - 0xFF */ N, N, N, N, N, N, N, N, } }; static const struct escape escape_db = { { N, N, N, N, N, N, N, N, }, { /* 0xC0 - 0xC7 */ N, N, N, N, N, N, N, N, /* 0xC8 - 0xCF */ N, N, N, N, N, N, N, N, /* 0xD0 - 0xC7 */ N, N, N, N, N, N, N, N, /* 0xD8 - 0xDF */ N, N, N, N, N, N, N, N, /* 0xE0 - 0xE7 */ N, N, N, I(ImplicitOps, em_fninit), N, N, N, N, /* 0xE8 - 0xEF */ N, N, N, N, N, N, N, N, /* 0xF0 - 0xF7 */ N, N, N, N, N, N, N, N, /* 0xF8 - 0xFF */ N, N, N, N, N, N, N, N, } }; static const struct escape escape_dd = { { N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw), }, { /* 0xC0 - 0xC7 */ N, N, N, N, N, N, N, N, /* 0xC8 - 0xCF */ N, N, N, N, N, N, N, N, /* 0xD0 - 0xC7 */ N, N, N, N, N, N, N, N, /* 0xD8 - 0xDF */ N, N, N, N, N, N, N, N, /* 0xE0 - 0xE7 */ N, N, N, N, N, N, N, N, /* 0xE8 - 0xEF */ N, N, N, N, N, N, N, N, /* 0xF0 - 0xF7 */ N, N, N, N, N, N, N, N, /* 0xF8 - 0xFF */ N, N, N, N, N, N, N, N, } }; static const struct instr_dual instr_dual_0f_c3 = { I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N }; static const struct mode_dual mode_dual_63 = { N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd) }; static const struct instr_dual instr_dual_8d = { D(DstReg | SrcMem | ModRM | NoAccess), N }; static const struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ I6ALU(Lock, em_add), I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg), /* 0x08 - 0x0F */ I6ALU(Lock | PageTable, em_or), I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg), N, /* 0x10 - 0x17 */ I6ALU(Lock, em_adc), I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg), I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg), /* 0x18 - 0x1F */ I6ALU(Lock, em_sbb), I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg), I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg), /* 0x20 - 0x27 */ I6ALU(Lock | PageTable, em_and), N, N, /* 0x28 - 0x2F */ I6ALU(Lock, 
em_sub), N, I(ByteOp | DstAcc | No64, em_das), /* 0x30 - 0x37 */ I6ALU(Lock, em_xor), N, N, /* 0x38 - 0x3F */ I6ALU(NoWrite, em_cmp), N, N, /* 0x40 - 0x4F */ X8(I(DstReg, em_inc)), X8(I(DstReg, em_dec)), /* 0x50 - 0x57 */ X8(I(SrcReg | Stack, em_push)), /* 0x58 - 0x5F */ X8(I(DstReg | Stack, em_pop)), /* 0x60 - 0x67 */ I(ImplicitOps | Stack | No64, em_pusha), I(ImplicitOps | Stack | No64, em_popa), N, MD(ModRM, &mode_dual_63), N, N, N, N, /* 0x68 - 0x6F */ I(SrcImm | Mov | Stack, em_push), I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), I(SrcImmByte | Mov | Stack, em_push), I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */ I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ X16(D(SrcImmByte | NearBranch | IsBranch)), /* 0x80 - 0x87 */ G(ByteOp | DstMem | SrcImm, group1), G(DstMem | SrcImm, group1), G(ByteOp | DstMem | SrcImm | No64, group1), G(DstMem | SrcImmByte, group1), I2bv(DstMem | SrcReg | ModRM | NoWrite, em_test), I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg), /* 0x88 - 0x8F */ I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov), I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), I(DstMem | SrcNone | ModRM | Mov | PageTable, em_mov_rm_sreg), ID(0, &instr_dual_8d), I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm), G(0, group1A), /* 0x90 - 0x97 */ DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), I(SrcImmFAddr | No64 | IsBranch | ShadowStack, em_call_far), N, II(ImplicitOps | Stack, em_pushf, pushf), II(ImplicitOps | Stack, em_popf, popf), I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf), /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), I2bv(SrcSI | DstDI | Mov | String | TwoMemOp, em_mov), I2bv(SrcSI | DstDI | String | NoWrite | TwoMemOp, em_cmp_r), /* 0xA8 - 0xAF */ I2bv(DstAcc | SrcImm | NoWrite, em_test), I2bv(SrcAcc | DstDI | Mov | String, em_mov), I2bv(SrcSI | DstAcc | Mov | String, em_mov), I2bv(SrcAcc | DstDI | String | NoWrite, em_cmp_r), /* 0xB0 - 0xB7 */ X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), /* 0xB8 - 0xBF */ X8(I(DstReg | SrcImm64 | Mov, em_mov)), /* 0xC0 - 0xC7 */ G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2), I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch | ShadowStack, em_ret_near_imm), I(ImplicitOps | NearBranch | IsBranch | ShadowStack, em_ret), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), I(ImplicitOps | SrcImmU16 | IsBranch | ShadowStack, em_ret_far_imm), I(ImplicitOps | IsBranch | ShadowStack, em_ret_far), D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch | ShadowStack, intn), D(ImplicitOps | No64 | IsBranch), II(ImplicitOps | IsBranch | ShadowStack, em_iret, iret), /* 0xD0 - 0xD7 */ G(Src2One | ByteOp, group2), G(Src2One, group2), G(Src2CL | ByteOp, group2), G(Src2CL, group2), I(DstAcc | SrcImmUByte | No64, em_aam), I(DstAcc | SrcImmUByte | No64, em_aad), I(DstAcc | ByteOp | No64, em_salc), I(DstAcc | SrcXLat | ByteOp, em_mov), /* 0xD8 - 0xDF */ N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N, /* 0xE0 - 0xE7 */ X3(I(SrcImmByte | NearBranch | IsBranch, em_loop)), I(SrcImmByte | NearBranch | IsBranch, em_jcxz), 
I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in), I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out), /* 0xE8 - 0xEF */ I(SrcImm | NearBranch | IsBranch | ShadowStack, em_call), D(SrcImm | ImplicitOps | NearBranch | IsBranch), I(SrcImmFAddr | No64 | IsBranch, em_jmp_far), D(SrcImmByte | ImplicitOps | NearBranch | IsBranch), I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in), I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out), /* 0xF0 - 0xF7 */ N, DI(ImplicitOps, icebp), N, N, DI(ImplicitOps | Priv, hlt), D(ImplicitOps), G(ByteOp, group3), G(0, group3), /* 0xF8 - 0xFF */ D(ImplicitOps), D(ImplicitOps), I(ImplicitOps, em_cli), I(ImplicitOps, em_sti), D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), }; static const struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ G(0, group6), GD(0, &group7), N, N, N, I(ImplicitOps | EmulateOnUD | IsBranch | ShadowStack, em_syscall), II(ImplicitOps | Priv, em_clts, clts), N, DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, /* 0x10 - 0x1F */ GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_10_0f_11), GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_10_0f_11), N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 4 * prefetch + 4 * reserved NOP */ D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* NOP + 7 * reserved NOP */ /* 0x20 - 0x2F */ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_access), DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read), IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write, check_cr_access), IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write, check_dr_write), N, N, N, N, GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29), GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29), N, GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_2b), N, N, N, N, /* 0x30 - 0x3F */ II(ImplicitOps | Priv, em_wrmsr, wrmsr), IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), II(ImplicitOps | Priv, em_rdmsr, rdmsr), IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc), I(ImplicitOps | EmulateOnUD | IsBranch | ShadowStack, em_sysenter), I(ImplicitOps | Priv | EmulateOnUD | IsBranch | ShadowStack, em_sysexit), N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ X16(D(DstReg | SrcMem | ModRM)), /* 0x50 - 0x5F */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0x60 - 0x6F */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f), /* 0x70 - 0x7F */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f), /* 0x80 - 0x8F */ X16(D(SrcImm | NearBranch | IsBranch)), /* 0x90 - 0x9F */ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp | NoWrite, em_bt), I(DstMem | SrcReg | Src2ImmByte | ModRM, em_shld), I(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, /* 0xA8 - 0xAF */ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), II(EmulateOnUD | ImplicitOps, em_rsm, rsm), I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), I(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), I(DstMem | SrcReg | Src2CL | ModRM, em_shrd), 
GD(0, &group15), I(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable | SrcWrite, em_cmpxchg), I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, G(BitOp, group8), I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), I(DstReg | SrcMem | ModRM, em_bsf_c), I(DstReg | SrcMem | ModRM, em_bsr_c), D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xC7 */ I2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd), N, ID(0, &instr_dual_0f_c3), N, N, N, GD(0, &group9), /* 0xC8 - 0xCF */ X8(I(DstReg, em_bswap)), /* 0xD0 - 0xDF */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0xE0 - 0xEF */ N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7), N, N, N, N, N, N, N, N, /* 0xF0 - 0xFF */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N }; static const struct instr_dual instr_dual_0f_38_f0 = { I(DstReg | SrcMem | Mov, em_movbe), N }; static const struct instr_dual instr_dual_0f_38_f1 = { I(DstMem | SrcReg | Mov, em_movbe), N }; static const struct gprefix three_byte_0f_38_f0 = { ID(0, &instr_dual_0f_38_f0), ID(0, &instr_dual_0f_38_f0), N, N }; static const struct gprefix three_byte_0f_38_f1 = { ID(0, &instr_dual_0f_38_f1), ID(0, &instr_dual_0f_38_f1), N, N }; /* * Insns below are selected by the prefix which indexed by the third opcode * byte. */ static const struct opcode opcode_map_0f_38[256] = { /* 0x00 - 0x7f */ X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), /* 0x80 - 0xef */ X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), /* 0xf0 - 0xf1 */ GP(EmulateOnUD | ModRM, &three_byte_0f_38_f0), GP(EmulateOnUD | ModRM, &three_byte_0f_38_f1), /* 0xf2 - 0xff */ N, N, X4(N), X8(N) }; #undef D #undef N #undef G #undef GD #undef I #undef GP #undef EXT #undef MD #undef ID #undef D2bv #undef D2bvIP #undef I2bv #undef I2bvIP #undef I6ALU static bool is_shstk_instruction(struct x86_emulate_ctxt *ctxt) { return ctxt->d & ShadowStack; } static bool is_ibt_instruction(struct x86_emulate_ctxt *ctxt) { u64 flags = ctxt->d; if (!(flags & IsBranch)) return false; /* * All far JMPs and CALLs (including SYSCALL, SYSENTER, and INTn) are * indirect and thus affect IBT state. All far RETs (including SYSEXIT * and IRET) are protected via Shadow Stacks and thus don't affect IBT * state. IRET #GPs when returning to virtual-8086 and IBT or SHSTK is * enabled, but that should be handled by IRET emulation (in the very * unlikely scenario that KVM adds support for fully emulating IRET). */ if (!(flags & NearBranch)) return ctxt->execute != em_iret && ctxt->execute != em_ret_far && ctxt->execute != em_ret_far_imm && ctxt->execute != em_sysexit; switch (flags & SrcMask) { case SrcReg: case SrcMem: case SrcMem16: case SrcMem32: return true; case SrcMemFAddr: case SrcImmFAddr: /* Far branches should be handled above. */ WARN_ON_ONCE(1); return true; case SrcNone: case SrcImm: case SrcImmByte: /* * Note, ImmU16 is used only for the stack adjustment operand on ENTER * and RET instructions. ENTER isn't a branch and RET FAR is handled * by the NearBranch check above. RET itself isn't an indirect branch. 
*/ case SrcImmU16: return false; default: WARN_ONCE(1, "Unexpected Src operand '%llx' on branch", flags & SrcMask); return false; } } static unsigned imm_size(struct x86_emulate_ctxt *ctxt) { unsigned size; size = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; if (size == 8) size = 4; return size; } static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, unsigned size, bool sign_extension) { int rc = X86EMUL_CONTINUE; op->type = OP_IMM; op->bytes = size; op->addr.mem.ea = ctxt->_eip; /* NB. Immediates are sign-extended as necessary. */ switch (op->bytes) { case 1: op->val = insn_fetch(s8, ctxt); break; case 2: op->val = insn_fetch(s16, ctxt); break; case 4: op->val = insn_fetch(s32, ctxt); break; case 8: op->val = insn_fetch(s64, ctxt); break; } if (!sign_extension) { switch (op->bytes) { case 1: op->val &= 0xff; break; case 2: op->val &= 0xffff; break; case 4: op->val &= 0xffffffff; break; } } done: return rc; } static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, unsigned d) { int rc = X86EMUL_CONTINUE; switch (d) { case OpReg: decode_register_operand(ctxt, op); break; case OpImmUByte: rc = decode_imm(ctxt, op, 1, false); break; case OpMem: ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; mem_common: *op = ctxt->memop; ctxt->memopp = op; if (ctxt->d & BitOp) fetch_bit_operand(ctxt); op->orig_val = op->val; break; case OpMem64: ctxt->memop.bytes = (ctxt->op_bytes == 8) ? 16 : 8; goto mem_common; case OpAcc: op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); fetch_register_operand(op); op->orig_val = op->val; break; case OpAccLo: op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes; op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); fetch_register_operand(op); op->orig_val = op->val; break; case OpAccHi: if (ctxt->d & ByteOp) { op->type = OP_NONE; break; } op->type = OP_REG; op->bytes = ctxt->op_bytes; op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); fetch_register_operand(op); op->orig_val = op->val; break; case OpDI: op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.mem.ea = register_address(ctxt, VCPU_REGS_RDI); op->addr.mem.seg = VCPU_SREG_ES; op->val = 0; op->count = 1; break; case OpDX: op->type = OP_REG; op->bytes = 2; op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); fetch_register_operand(op); break; case OpCL: op->type = OP_IMM; op->bytes = 1; op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff; break; case OpImmByte: rc = decode_imm(ctxt, op, 1, true); break; case OpOne: op->type = OP_IMM; op->bytes = 1; op->val = 1; break; case OpImm: rc = decode_imm(ctxt, op, imm_size(ctxt), true); break; case OpImm64: rc = decode_imm(ctxt, op, ctxt->op_bytes, true); break; case OpMem8: ctxt->memop.bytes = 1; if (ctxt->memop.type == OP_REG) { ctxt->memop.addr.reg = decode_register(ctxt, ctxt->modrm_rm, true); fetch_register_operand(&ctxt->memop); } goto mem_common; case OpMem16: ctxt->memop.bytes = 2; goto mem_common; case OpMem32: ctxt->memop.bytes = 4; goto mem_common; case OpImmU16: rc = decode_imm(ctxt, op, 2, false); break; case OpImmU: rc = decode_imm(ctxt, op, imm_size(ctxt), false); break; case OpSI: op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.mem.ea = register_address(ctxt, VCPU_REGS_RSI); op->addr.mem.seg = ctxt->seg_override; op->val = 0; op->count = 1; break; case OpXLat: op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 
1 : ctxt->op_bytes; op->addr.mem.ea = address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RBX) + (reg_read(ctxt, VCPU_REGS_RAX) & 0xff)); op->addr.mem.seg = ctxt->seg_override; op->val = 0; break; case OpImmFAddr: op->type = OP_IMM; op->addr.mem.ea = ctxt->_eip; op->bytes = ctxt->op_bytes + 2; insn_fetch_arr(op->valptr, op->bytes, ctxt); break; case OpMemFAddr: ctxt->memop.bytes = ctxt->op_bytes + 2; goto mem_common; case OpES: op->type = OP_IMM; op->val = VCPU_SREG_ES; break; case OpCS: op->type = OP_IMM; op->val = VCPU_SREG_CS; break; case OpSS: op->type = OP_IMM; op->val = VCPU_SREG_SS; break; case OpDS: op->type = OP_IMM; op->val = VCPU_SREG_DS; break; case OpFS: op->type = OP_IMM; op->val = VCPU_SREG_FS; break; case OpGS: op->type = OP_IMM; op->val = VCPU_SREG_GS; break; case OpImplicit: /* Special instructions do their own operand decoding. */ default: op->type = OP_NONE; /* Disable writeback. */ break; } done: return rc; } int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type) { int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; int def_op_bytes, def_ad_bytes, goffset, simd_prefix; bool op_prefix = false; bool has_seg_override = false; struct opcode opcode; u16 dummy; struct desc_struct desc; ctxt->memop.type = OP_NONE; ctxt->memopp = NULL; ctxt->_eip = ctxt->eip; ctxt->fetch.ptr = ctxt->fetch.data; ctxt->fetch.end = ctxt->fetch.data + insn_len; ctxt->opcode_len = 1; ctxt->intercept = x86_intercept_none; if (insn_len > 0) memcpy(ctxt->fetch.data, insn, insn_len); else { rc = __do_insn_fetch_bytes(ctxt, 1); if (rc != X86EMUL_CONTINUE) goto done; } switch (mode) { case X86EMUL_MODE_REAL: case X86EMUL_MODE_VM86: def_op_bytes = def_ad_bytes = 2; ctxt->ops->get_segment(ctxt, &dummy, &desc, NULL, VCPU_SREG_CS); if (desc.d) def_op_bytes = def_ad_bytes = 4; break; case X86EMUL_MODE_PROT16: def_op_bytes = def_ad_bytes = 2; break; case X86EMUL_MODE_PROT32: def_op_bytes = def_ad_bytes = 4; break; #ifdef CONFIG_X86_64 case X86EMUL_MODE_PROT64: def_op_bytes = 4; def_ad_bytes = 8; break; #endif default: return EMULATION_FAILED; } ctxt->op_bytes = def_op_bytes; ctxt->ad_bytes = def_ad_bytes; /* Legacy prefixes. */ for (;;) { switch (ctxt->b = insn_fetch(u8, ctxt)) { case 0x66: /* operand-size override */ op_prefix = true; /* switch between 2/4 bytes */ ctxt->op_bytes = def_op_bytes ^ 6; break; case 0x67: /* address-size override */ if (mode == X86EMUL_MODE_PROT64) /* switch between 4/8 bytes */ ctxt->ad_bytes = def_ad_bytes ^ 12; else /* switch between 2/4 bytes */ ctxt->ad_bytes = def_ad_bytes ^ 6; break; case 0x26: /* ES override */ has_seg_override = true; ctxt->seg_override = VCPU_SREG_ES; break; case 0x2e: /* CS override */ has_seg_override = true; ctxt->seg_override = VCPU_SREG_CS; break; case 0x36: /* SS override */ has_seg_override = true; ctxt->seg_override = VCPU_SREG_SS; break; case 0x3e: /* DS override */ has_seg_override = true; ctxt->seg_override = VCPU_SREG_DS; break; case 0x64: /* FS override */ has_seg_override = true; ctxt->seg_override = VCPU_SREG_FS; break; case 0x65: /* GS override */ has_seg_override = true; ctxt->seg_override = VCPU_SREG_GS; break; case 0x40 ... 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) goto done_prefixes; ctxt->rex_prefix = ctxt->b; continue; case 0xf0: /* LOCK */ ctxt->lock_prefix = 1; break; case 0xf2: /* REPNE/REPNZ */ case 0xf3: /* REP/REPE/REPZ */ ctxt->rep_prefix = ctxt->b; break; default: goto done_prefixes; } /* Any legacy prefix after a REX prefix nullifies its effect. 
*/ ctxt->rex_prefix = 0; } done_prefixes: /* REX prefix. */ if (ctxt->rex_prefix & 8) ctxt->op_bytes = 8; /* REX.W */ /* Opcode byte(s). */ opcode = opcode_table[ctxt->b]; /* Two-byte opcode? */ if (ctxt->b == 0x0f) { ctxt->opcode_len = 2; ctxt->b = insn_fetch(u8, ctxt); opcode = twobyte_table[ctxt->b]; /* 0F_38 opcode map */ if (ctxt->b == 0x38) { ctxt->opcode_len = 3; ctxt->b = insn_fetch(u8, ctxt); opcode = opcode_map_0f_38[ctxt->b]; } } ctxt->d = opcode.flags; if (ctxt->d & ModRM) ctxt->modrm = insn_fetch(u8, ctxt); /* vex-prefix instructions are not implemented */ if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) && (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) { ctxt->d = NotImpl; } while (ctxt->d & GroupMask) { switch (ctxt->d & GroupMask) { case Group: goffset = (ctxt->modrm >> 3) & 7; opcode = opcode.u.group[goffset]; break; case GroupDual: goffset = (ctxt->modrm >> 3) & 7; if ((ctxt->modrm >> 6) == 3) opcode = opcode.u.gdual->mod3[goffset]; else opcode = opcode.u.gdual->mod012[goffset]; break; case RMExt: goffset = ctxt->modrm & 7; opcode = opcode.u.group[goffset]; break; case Prefix: if (ctxt->rep_prefix && op_prefix) return EMULATION_FAILED; simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix; switch (simd_prefix) { case 0x00: opcode = opcode.u.gprefix->pfx_no; break; case 0x66: opcode = opcode.u.gprefix->pfx_66; break; case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break; case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break; } break; case Escape: if (ctxt->modrm > 0xbf) { size_t size = ARRAY_SIZE(opcode.u.esc->high); u32 index = array_index_nospec( ctxt->modrm - 0xc0, size); opcode = opcode.u.esc->high[index]; } else { opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7]; } break; case InstrDual: if ((ctxt->modrm >> 6) == 3) opcode = opcode.u.idual->mod3; else opcode = opcode.u.idual->mod012; break; case ModeDual: if (ctxt->mode == X86EMUL_MODE_PROT64) opcode = opcode.u.mdual->mode64; else opcode = opcode.u.mdual->mode32; break; default: return EMULATION_FAILED; } ctxt->d &= ~(u64)GroupMask; ctxt->d |= opcode.flags; } ctxt->is_branch = opcode.flags & IsBranch; /* Unrecognised? */ if (ctxt->d == 0) return EMULATION_FAILED; ctxt->execute = opcode.u.execute; /* * Reject emulation if KVM might need to emulate shadow stack updates * and/or indirect branch tracking enforcement, which the emulator * doesn't support. */ if ((is_ibt_instruction(ctxt) || is_shstk_instruction(ctxt)) && ctxt->ops->get_cr(ctxt, 4) & X86_CR4_CET) { u64 u_cet = 0, s_cet = 0; /* * Check both User and Supervisor on far transfers as inter- * privilege level transfers are impacted by CET at the target * privilege level, and that is not known at this time. The * expectation is that the guest will not require emulation of * any CET-affected instructions at any privilege level. 
*/ if (!(ctxt->d & NearBranch)) u_cet = s_cet = CET_SHSTK_EN | CET_ENDBR_EN; else if (ctxt->ops->cpl(ctxt) == 3) u_cet = CET_SHSTK_EN | CET_ENDBR_EN; else s_cet = CET_SHSTK_EN | CET_ENDBR_EN; if ((u_cet && ctxt->ops->get_msr(ctxt, MSR_IA32_U_CET, &u_cet)) || (s_cet && ctxt->ops->get_msr(ctxt, MSR_IA32_S_CET, &s_cet))) return EMULATION_FAILED; if ((u_cet | s_cet) & CET_SHSTK_EN && is_shstk_instruction(ctxt)) return EMULATION_FAILED; if ((u_cet | s_cet) & CET_ENDBR_EN && is_ibt_instruction(ctxt)) return EMULATION_FAILED; } if (unlikely(emulation_type & EMULTYPE_TRAP_UD) && likely(!(ctxt->d & EmulateOnUD))) return EMULATION_FAILED; if (unlikely(ctxt->d & (NotImpl|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm|NearBranch| No16))) { /* * These are copied unconditionally here, and checked unconditionally * in x86_emulate_insn. */ ctxt->check_perm = opcode.check_perm; ctxt->intercept = opcode.intercept; if (ctxt->d & NotImpl) return EMULATION_FAILED; if (mode == X86EMUL_MODE_PROT64) { if (ctxt->op_bytes == 4 && (ctxt->d & Stack)) ctxt->op_bytes = 8; else if (ctxt->d & NearBranch) ctxt->op_bytes = 8; } if (ctxt->d & Op3264) { if (mode == X86EMUL_MODE_PROT64) ctxt->op_bytes = 8; else ctxt->op_bytes = 4; } if ((ctxt->d & No16) && ctxt->op_bytes == 2) ctxt->op_bytes = 4; if (ctxt->d & Sse) ctxt->op_bytes = 16; else if (ctxt->d & Mmx) ctxt->op_bytes = 8; } /* ModRM and SIB bytes. */ if (ctxt->d & ModRM) { rc = decode_modrm(ctxt, &ctxt->memop); if (!has_seg_override) { has_seg_override = true; ctxt->seg_override = ctxt->modrm_seg; } } else if (ctxt->d & MemAbs) rc = decode_abs(ctxt, &ctxt->memop); if (rc != X86EMUL_CONTINUE) goto done; if (!has_seg_override) ctxt->seg_override = VCPU_SREG_DS; ctxt->memop.addr.mem.seg = ctxt->seg_override; /* * Decode and fetch the source operand: register, memory * or immediate. */ rc = decode_operand(ctxt, &ctxt->src, (ctxt->d >> SrcShift) & OpMask); if (rc != X86EMUL_CONTINUE) goto done; /* * Decode and fetch the second source operand: register, memory * or immediate. */ rc = decode_operand(ctxt, &ctxt->src2, (ctxt->d >> Src2Shift) & OpMask); if (rc != X86EMUL_CONTINUE) goto done; /* Decode and fetch the destination operand: register or memory. */ rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); if (ctxt->rip_relative && likely(ctxt->memopp)) ctxt->memopp->addr.mem.ea = address_mask(ctxt, ctxt->memopp->addr.mem.ea + ctxt->_eip); done: if (rc == X86EMUL_PROPAGATE_FAULT) ctxt->have_exception = true; return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; } bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt) { return ctxt->d & PageTable; } static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) { /* The second termination condition only applies for REPE * and REPNE. 
Test if the repeat string operation prefix is * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the * corresponding termination condition according to: * - if REPE/REPZ and ZF = 0 then done * - if REPNE/REPNZ and ZF = 1 then done */ if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) || (ctxt->b == 0xae) || (ctxt->b == 0xaf)) && (((ctxt->rep_prefix == REPE_PREFIX) && ((ctxt->eflags & X86_EFLAGS_ZF) == 0)) || ((ctxt->rep_prefix == REPNE_PREFIX) && ((ctxt->eflags & X86_EFLAGS_ZF) == X86_EFLAGS_ZF)))) return true; return false; } static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) { int rc; kvm_fpu_get(); rc = asm_safe("fwait"); kvm_fpu_put(); if (unlikely(rc != X86EMUL_CONTINUE)) return emulate_exception(ctxt, MF_VECTOR, 0, false); return X86EMUL_CONTINUE; } static void fetch_possible_mmx_operand(struct operand *op) { if (op->type == OP_MM) kvm_read_mmx_reg(op->addr.mm, &op->mm_val); } void init_decode_cache(struct x86_emulate_ctxt *ctxt) { /* Clear fields that are set conditionally but read without a guard. */ ctxt->rip_relative = false; ctxt->rex_prefix = 0; ctxt->lock_prefix = 0; ctxt->rep_prefix = 0; ctxt->regs_valid = 0; ctxt->regs_dirty = 0; ctxt->io_read.pos = 0; ctxt->io_read.end = 0; ctxt->mem_read.end = 0; } int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts) { const struct x86_emulate_ops *ops = ctxt->ops; int rc = X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; ctxt->mem_read.pos = 0; /* LOCK prefix is allowed only with some instructions */ if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) { rc = emulate_ud(ctxt); goto done; } if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) { rc = emulate_ud(ctxt); goto done; } if (unlikely(ctxt->d & (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || (ctxt->d & Undefined)) { rc = emulate_ud(ctxt); goto done; } if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { rc = emulate_ud(ctxt); goto done; } if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { rc = emulate_nm(ctxt); goto done; } if (ctxt->d & Mmx) { rc = flush_pending_x87_faults(ctxt); if (rc != X86EMUL_CONTINUE) goto done; /* * Now that we know the fpu is exception safe, we can fetch * operands from it. 
*/ fetch_possible_mmx_operand(&ctxt->src); fetch_possible_mmx_operand(&ctxt->src2); if (!(ctxt->d & Mov)) fetch_possible_mmx_operand(&ctxt->dst); } if (unlikely(check_intercepts) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) goto done; } /* Instruction can only be executed in protected mode */ if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) { rc = emulate_ud(ctxt); goto done; } /* Privileged instruction can be executed only in CPL=0 */ if ((ctxt->d & Priv) && ops->cpl(ctxt)) { if (ctxt->d & PrivUD) rc = emulate_ud(ctxt); else rc = emulate_gp(ctxt, 0); goto done; } /* Do instruction specific permission checks */ if (ctxt->d & CheckPerm) { rc = ctxt->check_perm(ctxt); if (rc != X86EMUL_CONTINUE) goto done; } if (unlikely(check_intercepts) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) goto done; } if (ctxt->rep_prefix && (ctxt->d & String)) { /* All REP prefixes have the same first termination condition */ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { string_registers_quirk(ctxt); ctxt->eip = ctxt->_eip; ctxt->eflags &= ~X86_EFLAGS_RF; goto done; } } } if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) { rc = segmented_read(ctxt, ctxt->src.addr.mem, ctxt->src.valptr, ctxt->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; ctxt->src.orig_val64 = ctxt->src.val64; } if (ctxt->src2.type == OP_MEM) { rc = segmented_read(ctxt, ctxt->src2.addr.mem, &ctxt->src2.val, ctxt->src2.bytes); if (rc != X86EMUL_CONTINUE) goto done; } if ((ctxt->d & DstMask) == ImplicitOps) goto special_insn; if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) { /* optimisation - avoid slow emulated read if Mov */ rc = segmented_read(ctxt, ctxt->dst.addr.mem, &ctxt->dst.val, ctxt->dst.bytes); if (rc != X86EMUL_CONTINUE) { if (!(ctxt->d & NoWrite) && rc == X86EMUL_PROPAGATE_FAULT && ctxt->exception.vector == PF_VECTOR) ctxt->exception.error_code |= PFERR_WRITE_MASK; goto done; } } /* Copy full 64-bit value for CMPXCHG8B. */ ctxt->dst.orig_val64 = ctxt->dst.val64; special_insn: if (unlikely(check_intercepts) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) goto done; } if (ctxt->rep_prefix && (ctxt->d & String)) ctxt->eflags |= X86_EFLAGS_RF; else ctxt->eflags &= ~X86_EFLAGS_RF; if (ctxt->execute) { rc = ctxt->execute(ctxt); if (rc != X86EMUL_CONTINUE) goto done; goto writeback; } if (ctxt->opcode_len == 2) goto twobyte_insn; else if (ctxt->opcode_len == 3) goto threebyte_insn; switch (ctxt->b) { case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(ctxt->b, ctxt->eflags)) rc = jmp_rel(ctxt, ctxt->src.val); break; case 0x8d: /* lea r16/r32, m */ ctxt->dst.val = ctxt->src.addr.mem.ea; break; case 0x90 ... 
0x97: /* nop / xchg reg, rax */ if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX)) ctxt->dst.type = OP_NONE; else rc = em_xchg(ctxt); break; case 0x98: /* cbw/cwde/cdqe */ switch (ctxt->op_bytes) { case 2: ctxt->dst.val = (s8)ctxt->dst.val; break; case 4: ctxt->dst.val = (s16)ctxt->dst.val; break; case 8: ctxt->dst.val = (s32)ctxt->dst.val; break; } break; case 0xcc: /* int3 */ rc = emulate_int(ctxt, 3); break; case 0xcd: /* int n */ rc = emulate_int(ctxt, ctxt->src.val); break; case 0xce: /* into */ if (ctxt->eflags & X86_EFLAGS_OF) rc = emulate_int(ctxt, 4); break; case 0xe9: /* jmp rel */ case 0xeb: /* jmp rel short */ rc = jmp_rel(ctxt, ctxt->src.val); ctxt->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf4: /* hlt */ ctxt->ops->halt(ctxt); break; case 0xf5: /* cmc */ /* complement carry flag from eflags reg */ ctxt->eflags ^= X86_EFLAGS_CF; break; case 0xf8: /* clc */ ctxt->eflags &= ~X86_EFLAGS_CF; break; case 0xf9: /* stc */ ctxt->eflags |= X86_EFLAGS_CF; break; case 0xfc: /* cld */ ctxt->eflags &= ~X86_EFLAGS_DF; break; case 0xfd: /* std */ ctxt->eflags |= X86_EFLAGS_DF; break; default: goto cannot_emulate; } if (rc != X86EMUL_CONTINUE) goto done; writeback: if (ctxt->d & SrcWrite) { BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR); rc = writeback(ctxt, &ctxt->src); if (rc != X86EMUL_CONTINUE) goto done; } if (!(ctxt->d & NoWrite)) { rc = writeback(ctxt, &ctxt->dst); if (rc != X86EMUL_CONTINUE) goto done; } /* * restore dst type in case the decoding will be reused * (happens for string instruction ) */ ctxt->dst.type = saved_dst_type; if ((ctxt->d & SrcMask) == SrcSI) string_addr_inc(ctxt, VCPU_REGS_RSI, &ctxt->src); if ((ctxt->d & DstMask) == DstDI) string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst); if (ctxt->rep_prefix && (ctxt->d & String)) { unsigned int count; struct read_cache *r = &ctxt->io_read; if ((ctxt->d & SrcMask) == SrcSI) count = ctxt->src.count; else count = ctxt->dst.count; register_address_increment(ctxt, VCPU_REGS_RCX, -count); if (!string_insn_completed(ctxt)) { /* * Re-enter guest when pio read ahead buffer is empty * or, if it is not used, after each 1024 iteration. */ if ((r->end != 0 || reg_read(ctxt, VCPU_REGS_RCX) & 0x3ff) && (r->end == 0 || r->end != r->pos)) { /* * Reset read cache. Usually happens before * decode, but since instruction is restarted * we have to do it here. */ ctxt->mem_read.end = 0; writeback_registers(ctxt); return EMULATION_RESTART; } goto done; /* skip rip writeback */ } ctxt->eflags &= ~X86_EFLAGS_RF; } ctxt->eip = ctxt->_eip; if (ctxt->mode != X86EMUL_MODE_PROT64) ctxt->eip = (u32)ctxt->_eip; done: if (rc == X86EMUL_PROPAGATE_FAULT) { if (KVM_EMULATOR_BUG_ON(ctxt->exception.vector > 0x1f, ctxt)) return EMULATION_FAILED; ctxt->have_exception = true; } if (rc == X86EMUL_INTERCEPTED) return EMULATION_INTERCEPTED; if (rc == X86EMUL_CONTINUE) writeback_registers(ctxt); return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; twobyte_insn: switch (ctxt->b) { case 0x09: /* wbinvd */ (ctxt->ops->wbinvd)(ctxt); break; case 0x08: /* invd */ case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ case 0x1f: /* nop */ break; case 0x20: /* mov cr, reg */ ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg); break; case 0x21: /* mov from dr to reg */ ctxt->dst.val = ops->get_dr(ctxt, ctxt->modrm_reg); break; case 0x40 ... 
0x4f: /* cmov */ if (test_cc(ctxt->b, ctxt->eflags)) ctxt->dst.val = ctxt->src.val; else if (ctxt->op_bytes != 4) ctxt->dst.type = OP_NONE; /* no writeback */ break; case 0x80 ... 0x8f: /* jnz rel, etc*/ if (test_cc(ctxt->b, ctxt->eflags)) rc = jmp_rel(ctxt, ctxt->src.val); break; case 0x90 ... 0x9f: /* setcc r/m8 */ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); break; case 0xb6 ... 0xb7: /* movzx */ ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val : (u16) ctxt->src.val; break; case 0xbe ... 0xbf: /* movsx */ ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : (s16) ctxt->src.val; break; default: goto cannot_emulate; } threebyte_insn: if (rc != X86EMUL_CONTINUE) goto done; goto writeback; cannot_emulate: return EMULATION_FAILED; } void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt) { invalidate_registers(ctxt); } void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt) { writeback_registers(ctxt); } bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt) { if (ctxt->rep_prefix && (ctxt->d & String)) return false; if (ctxt->d & TwoMemOp) return false; return true; }
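The decoder above is entirely table-driven: the first opcode byte indexes a 256-entry table, 0x0f escapes to a second table, and Group entries are resolved through the ModRM reg field. The following standalone sketch illustrates that dispatch shape with deliberately simplified structures; demo_opcode, demo_decode and the handlers are hypothetical names for illustration, not the emulator's real API.

/*
 * Minimal, self-contained sketch (not the kernel's structures) of the
 * table-driven dispatch used above: one opcode byte indexes a 256-entry
 * table, and a "group" entry is resolved by the ModRM reg field (bits 5:3).
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef int (*demo_handler)(void);

struct demo_opcode {
	demo_handler execute;            /* NULL means "resolve via group" */
	const struct demo_opcode *group; /* 8 entries, indexed by ModRM reg */
};

static int demo_inc(void) { puts("inc r/m"); return 0; }
static int demo_dec(void) { puts("dec r/m"); return 0; }

/* Group for opcode 0xFF: /0 = INC, /1 = DEC, rest undefined in this sketch. */
static const struct demo_opcode demo_group5[8] = {
	{ demo_inc, NULL }, { demo_dec, NULL },
};

/* One-byte opcode table: only 0xFF is populated for the demo. */
static const struct demo_opcode demo_table[256] = {
	[0xff] = { NULL, demo_group5 },
};

static int demo_decode(const uint8_t *insn, size_t len)
{
	const struct demo_opcode *op;
	uint8_t greg;

	if (len < 2)
		return -1;
	op = &demo_table[insn[0]];
	if (op->group) {
		greg = (insn[1] >> 3) & 7;  /* same reg-field extraction as above */
		op = &op->group[greg];
	}
	return op->execute ? op->execute() : -1;
}

int main(void)
{
	const uint8_t inc_eax[] = { 0xff, 0xc0 };  /* FF /0 */
	const uint8_t dec_eax[] = { 0xff, 0xc8 };  /* FF /1 */

	demo_decode(inc_eax, sizeof(inc_eax));
	demo_decode(dec_eax, sizeof(dec_eax));
	return 0;
}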
// SPDX-License-Identifier: GPL-2.0-only /* * vivid-radio-rx.c - radio receiver support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/delay.h> #include <linux/videodev2.h> #include <linux/v4l2-dv-timings.h> #include <linux/sched/signal.h> #include <media/v4l2-common.h> #include <media/v4l2-event.h> #include <media/v4l2-dv-timings.h> #include "vivid-core.h" #include "vivid-ctrls.h" #include "vivid-radio-common.h" #include "vivid-rds-gen.h" #include "vivid-radio-rx.h" ssize_t vivid_radio_rx_read(struct file *file, char __user *buf, size_t size, loff_t *offset) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_rds_data *data = dev->rds_gen.data; bool use_alternates; ktime_t timestamp; unsigned blk; int perc; int i; if (dev->radio_rx_rds_controls) return -EINVAL; if (size < sizeof(*data)) return 0; size = sizeof(*data) * (size / sizeof(*data)); if (mutex_lock_interruptible(&dev->mutex)) return -ERESTARTSYS; if (dev->radio_rx_rds_owner && file_to_v4l2_fh(file) != dev->radio_rx_rds_owner) { mutex_unlock(&dev->mutex); return -EBUSY; } if (dev->radio_rx_rds_owner == NULL) { vivid_radio_rds_init(dev); dev->radio_rx_rds_owner = file_to_v4l2_fh(file); } retry: timestamp = ktime_sub(ktime_get(), dev->radio_rds_init_time); blk = ktime_divns(timestamp, VIVID_RDS_NSEC_PER_BLK); use_alternates = (blk % VIVID_RDS_GEN_BLOCKS) & 1; if (dev->radio_rx_rds_last_block == 0 || dev->radio_rx_rds_use_alternates != use_alternates) { dev->radio_rx_rds_use_alternates = use_alternates; /* Re-init the RDS generator */ vivid_radio_rds_init(dev); } if (blk >= dev->radio_rx_rds_last_block + VIVID_RDS_GEN_BLOCKS) dev->radio_rx_rds_last_block = blk - VIVID_RDS_GEN_BLOCKS + 1; /* * No data is available if there hasn't been time to get new data, * or if the RDS receiver has been disabled, or if we use the data * from the RDS transmitter and that RDS transmitter has been disabled, * or if the signal quality is too weak.
*/ if (blk == dev->radio_rx_rds_last_block || !dev->radio_rx_rds_enabled || (dev->radio_rds_loop && !(dev->radio_tx_subchans & V4L2_TUNER_SUB_RDS)) || abs(dev->radio_rx_sig_qual) > 200) { mutex_unlock(&dev->mutex); if (file->f_flags & O_NONBLOCK) return -EWOULDBLOCK; if (msleep_interruptible(20) && signal_pending(current)) return -EINTR; if (mutex_lock_interruptible(&dev->mutex)) return -ERESTARTSYS; goto retry; } /* abs(dev->radio_rx_sig_qual) <= 200, map that to a 0-50% range */ perc = abs(dev->radio_rx_sig_qual) / 4; for (i = 0; i < size && blk > dev->radio_rx_rds_last_block; dev->radio_rx_rds_last_block++) { unsigned data_blk = dev->radio_rx_rds_last_block % VIVID_RDS_GEN_BLOCKS; struct v4l2_rds_data rds = data[data_blk]; if (data_blk == 0 && dev->radio_rds_loop) vivid_radio_rds_init(dev); if (perc && get_random_u32_below(100) < perc) { switch (get_random_u32_below(4)) { case 0: rds.block |= V4L2_RDS_BLOCK_CORRECTED; break; case 1: rds.block |= V4L2_RDS_BLOCK_INVALID; break; case 2: rds.block |= V4L2_RDS_BLOCK_ERROR; rds.lsb = get_random_u8(); rds.msb = get_random_u8(); break; case 3: /* Skip block altogether */ if (i) continue; /* * Must make sure at least one block is * returned, otherwise the application * might think that end-of-file occurred. */ break; } } if (copy_to_user(buf + i, &rds, sizeof(rds))) { i = -EFAULT; break; } i += sizeof(rds); } mutex_unlock(&dev->mutex); return i; } __poll_t vivid_radio_rx_poll(struct file *file, struct poll_table_struct *wait) { return EPOLLIN | EPOLLRDNORM | v4l2_ctrl_poll(file, wait); } int vivid_radio_rx_enum_freq_bands(struct file *file, void *priv, struct v4l2_frequency_band *band) { if (band->tuner != 0) return -EINVAL; if (band->index >= TOT_BANDS) return -EINVAL; *band = vivid_radio_bands[band->index]; return 0; } int vivid_radio_rx_s_hw_freq_seek(struct file *file, void *priv, const struct v4l2_hw_freq_seek *a) { struct vivid_dev *dev = video_drvdata(file); unsigned low, high; unsigned freq; unsigned spacing; unsigned band; if (a->tuner) return -EINVAL; if (a->wrap_around && dev->radio_rx_hw_seek_mode == VIVID_HW_SEEK_BOUNDED) return -EINVAL; if (!a->wrap_around && dev->radio_rx_hw_seek_mode == VIVID_HW_SEEK_WRAP) return -EINVAL; if (!a->rangelow ^ !a->rangehigh) return -EINVAL; if (file->f_flags & O_NONBLOCK) return -EWOULDBLOCK; if (a->rangelow) { for (band = 0; band < TOT_BANDS; band++) if (a->rangelow >= vivid_radio_bands[band].rangelow && a->rangehigh <= vivid_radio_bands[band].rangehigh) break; if (band == TOT_BANDS) return -EINVAL; if (!dev->radio_rx_hw_seek_prog_lim && (a->rangelow != vivid_radio_bands[band].rangelow || a->rangehigh != vivid_radio_bands[band].rangehigh)) return -EINVAL; low = a->rangelow; high = a->rangehigh; } else { for (band = 0; band < TOT_BANDS; band++) if (dev->radio_rx_freq >= vivid_radio_bands[band].rangelow && dev->radio_rx_freq <= vivid_radio_bands[band].rangehigh) break; if (band == TOT_BANDS) return -EINVAL; low = vivid_radio_bands[band].rangelow; high = vivid_radio_bands[band].rangehigh; } spacing = band == BAND_AM ? 
1600 : 16000; freq = clamp(dev->radio_rx_freq, low, high); if (a->seek_upward) { freq = spacing * (freq / spacing) + spacing; if (freq > high) { if (!a->wrap_around) return -ENODATA; freq = spacing * (low / spacing) + spacing; if (freq >= dev->radio_rx_freq) return -ENODATA; } } else { freq = spacing * ((freq + spacing - 1) / spacing) - spacing; if (freq < low) { if (!a->wrap_around) return -ENODATA; freq = spacing * ((high + spacing - 1) / spacing) - spacing; if (freq <= dev->radio_rx_freq) return -ENODATA; } } return 0; } int vivid_radio_rx_g_tuner(struct file *file, void *priv, struct v4l2_tuner *vt) { struct vivid_dev *dev = video_drvdata(file); int delta = 800; int sig_qual; if (vt->index > 0) return -EINVAL; strscpy(vt->name, "AM/FM/SW Receiver", sizeof(vt->name)); vt->capability = V4L2_TUNER_CAP_LOW | V4L2_TUNER_CAP_STEREO | V4L2_TUNER_CAP_FREQ_BANDS | V4L2_TUNER_CAP_RDS | (dev->radio_rx_rds_controls ? V4L2_TUNER_CAP_RDS_CONTROLS : V4L2_TUNER_CAP_RDS_BLOCK_IO) | (dev->radio_rx_hw_seek_prog_lim ? V4L2_TUNER_CAP_HWSEEK_PROG_LIM : 0); switch (dev->radio_rx_hw_seek_mode) { case VIVID_HW_SEEK_BOUNDED: vt->capability |= V4L2_TUNER_CAP_HWSEEK_BOUNDED; break; case VIVID_HW_SEEK_WRAP: vt->capability |= V4L2_TUNER_CAP_HWSEEK_WRAP; break; case VIVID_HW_SEEK_BOTH: vt->capability |= V4L2_TUNER_CAP_HWSEEK_WRAP | V4L2_TUNER_CAP_HWSEEK_BOUNDED; break; } vt->rangelow = AM_FREQ_RANGE_LOW; vt->rangehigh = FM_FREQ_RANGE_HIGH; sig_qual = dev->radio_rx_sig_qual; vt->signal = abs(sig_qual) > delta ? 0 : 0xffff - ((unsigned)abs(sig_qual) * 0xffff) / delta; vt->afc = sig_qual > delta ? 0 : sig_qual; if (abs(sig_qual) > delta) vt->rxsubchans = 0; else if (dev->radio_rx_freq < FM_FREQ_RANGE_LOW || vt->signal < 0x8000) vt->rxsubchans = V4L2_TUNER_SUB_MONO; else if (dev->radio_rds_loop && !(dev->radio_tx_subchans & V4L2_TUNER_SUB_STEREO)) vt->rxsubchans = V4L2_TUNER_SUB_MONO; else vt->rxsubchans = V4L2_TUNER_SUB_STEREO; if (dev->radio_rx_rds_enabled && (!dev->radio_rds_loop || (dev->radio_tx_subchans & V4L2_TUNER_SUB_RDS)) && dev->radio_rx_freq >= FM_FREQ_RANGE_LOW && vt->signal >= 0xc000) vt->rxsubchans |= V4L2_TUNER_SUB_RDS; if (dev->radio_rx_rds_controls) vivid_radio_rds_init(dev); vt->audmode = dev->radio_rx_audmode; return 0; } int vivid_radio_rx_s_tuner(struct file *file, void *priv, const struct v4l2_tuner *vt) { struct vivid_dev *dev = video_drvdata(file); if (vt->index) return -EINVAL; dev->radio_rx_audmode = vt->audmode >= V4L2_TUNER_MODE_STEREO; return 0; }
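The hardware-seek emulation above snaps the current frequency to the next (or previous) multiple of the band's channel spacing and wraps around when allowed. The short program below reproduces only that rounding arithmetic, using the driver's 16000-unit FM spacing; the frequency values and function names are illustrative assumptions, not driver API.

/*
 * Standalone illustration of the frequency-snapping arithmetic used by the
 * hardware-seek emulation above: seeking up moves to the next multiple of
 * the channel spacing strictly above the current frequency, seeking down to
 * the previous multiple strictly below it.
 */
#include <stdio.h>

static unsigned seek_up(unsigned freq, unsigned spacing)
{
	return spacing * (freq / spacing) + spacing;
}

static unsigned seek_down(unsigned freq, unsigned spacing)
{
	return spacing * ((freq + spacing - 1) / spacing) - spacing;
}

int main(void)
{
	unsigned spacing = 16000;   /* FM spacing used by the driver above */
	unsigned freq = 1500000;    /* hypothetical in-band frequency */

	printf("up:   %u -> %u\n", freq, seek_up(freq, spacing));
	printf("down: %u -> %u\n", freq, seek_down(freq, spacing));

	/* Exactly on a channel: still moves one full step in each direction. */
	freq = seek_up(freq, spacing);
	printf("up:   %u -> %u\n", freq, seek_up(freq, spacing));
	printf("down: %u -> %u\n", freq, seek_down(freq, spacing));
	return 0;
}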
// SPDX-License-Identifier: GPL-2.0-only /* * MCE event pool management in MCE context * * Copyright (C) 2015 Intel Corp. * Author: Chen, Gong <gong.chen@linux.intel.com> */ #include <linux/smp.h> #include <linux/mm.h> #include <linux/genalloc.h> #include <linux/llist.h> #include "internal.h" /* * printk() is not safe in MCE context. This is a lock-less memory allocator * used to save error information organized in a lock-less list. * * This memory pool is only to be used to save MCE records in MCE context. * MCE events are rare, so a fixed size memory pool should be enough. * Allocate on a sliding scale based on number of CPUs. */ #define MCE_MIN_ENTRIES 80 #define MCE_PER_CPU 2 static struct gen_pool *mce_evt_pool; static LLIST_HEAD(mce_event_llist); /* * Compare the record "t" with each of the records on list "l" to see if * an equivalent one is present in the list. */ static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l) { struct mce_hw_err *err1, *err2; struct mce_evt_llist *node; err1 = &t->err; llist_for_each_entry(node, &l->llnode, llnode) { err2 = &node->err; if (!mce_cmp(&err1->m, &err2->m)) return true; } return false; } /* * The system has panicked - we'd like to peruse the list of MCE records * that have been queued, but not seen by anyone yet. The list is in * reverse time order, so we need to reverse it. While doing that we can * also drop duplicate records (these were logged because some banks are * shared between cores or by all threads on a socket.
*/ struct llist_node *mce_gen_pool_prepare_records(void) { struct llist_node *head; LLIST_HEAD(new_head); struct mce_evt_llist *node, *t; head = llist_del_all(&mce_event_llist); if (!head) return NULL; /* squeeze out duplicates while reversing order */ llist_for_each_entry_safe(node, t, head, llnode) { if (!is_duplicate_mce_record(node, t)) llist_add(&node->llnode, &new_head); } return new_head.first; } void mce_gen_pool_process(struct work_struct *__unused) { struct mce_evt_llist *node, *tmp; struct llist_node *head; struct mce *mce; head = llist_del_all(&mce_event_llist); if (!head) return; head = llist_reverse_order(head); llist_for_each_entry_safe(node, tmp, head, llnode) { mce = &node->err.m; blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); } } bool mce_gen_pool_empty(void) { return llist_empty(&mce_event_llist); } bool mce_gen_pool_add(struct mce_hw_err *err) { struct mce_evt_llist *node; if (filter_mce(&err->m)) return false; if (!mce_evt_pool) return false; node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node)); if (!node) { pr_warn_ratelimited("MCE records pool full!\n"); return false; } memcpy(&node->err, err, sizeof(*err)); llist_add(&node->llnode, &mce_event_llist); return true; } static bool mce_gen_pool_create(void) { int mce_numrecords, mce_poolsz, order; struct gen_pool *gpool; void *mce_pool; order = order_base_2(sizeof(struct mce_evt_llist)); gpool = gen_pool_create(order, -1); if (!gpool) return false; mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU); mce_poolsz = mce_numrecords * (1 << order); mce_pool = kmalloc(mce_poolsz, GFP_KERNEL); if (!mce_pool) { gen_pool_destroy(gpool); return false; } if (gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1)) { gen_pool_destroy(gpool); kfree(mce_pool); return false; } mce_evt_pool = gpool; return true; } bool mce_gen_pool_init(void) { /* Just init mce_gen_pool once. */ if (mce_evt_pool) return true; return mce_gen_pool_create(); }
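mce_gen_pool_create() sizes the pool by rounding each record up to a power-of-two slot and scaling the slot count with the possible CPU count, floored at MCE_MIN_ENTRIES. A rough userspace sketch of that arithmetic follows; the record size and CPU count are assumed values, and order_base_2() here is a plain reimplementation rather than the kernel helper.

/*
 * Back-of-the-envelope sketch (plain userspace C, not kernel API) of the
 * pool sizing done in mce_gen_pool_create().
 */
#include <stdio.h>

#define MCE_MIN_ENTRIES 80
#define MCE_PER_CPU      2

/* order_base_2(): smallest 'order' with (1 << order) >= size */
static unsigned order_base_2(unsigned size)
{
	unsigned order = 0;

	while ((1u << order) < size)
		order++;
	return order;
}

int main(void)
{
	unsigned record_size = 152;  /* hypothetical sizeof(struct mce_evt_llist) */
	unsigned nr_cpus = 64;       /* hypothetical num_possible_cpus() */
	unsigned order = order_base_2(record_size);
	unsigned nrecords = nr_cpus * MCE_PER_CPU;

	if (nrecords < MCE_MIN_ENTRIES)
		nrecords = MCE_MIN_ENTRIES;

	printf("slot size: %u bytes (order %u)\n", 1u << order, order);
	printf("records:   %u\n", nrecords);
	printf("pool size: %u bytes\n", nrecords * (1u << order));
	return 0;
}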
// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux network device link state notification * * Author: * Stefan Rompf <sux@loplof.de> */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/if.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <linux/rtnetlink.h> #include <linux/jiffies.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/bitops.h> #include <linux/types.h> #include "dev.h" enum lw_bits { LW_URGENT = 0, }; static unsigned long linkwatch_flags; static unsigned long linkwatch_nextevent; static void linkwatch_event(struct work_struct *dummy); static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); static LIST_HEAD(lweventlist); static DEFINE_SPINLOCK(lweventlist_lock); static unsigned int default_operstate(const struct net_device *dev) { if (netif_testing(dev)) return IF_OPER_TESTING; /* Some uppers (DSA) have additional sources for being down, so * first check whether lower is indeed the source of its down state. */ if (!netif_carrier_ok(dev)) { struct net_device *peer; int iflink; /* If called from netdev_run_todo()/linkwatch_sync_dev(), * dev_net(dev) can be already freed, and RTNL is not held. */ if (dev->reg_state <= NETREG_REGISTERED) iflink = dev_get_iflink(dev); else iflink = dev->ifindex; if (iflink == dev->ifindex) return IF_OPER_DOWN; ASSERT_RTNL(); peer = __dev_get_by_index(dev_net(dev), iflink); if (!peer) return IF_OPER_DOWN; return netif_carrier_ok(peer) ?
IF_OPER_DOWN : IF_OPER_LOWERLAYERDOWN; } if (netif_dormant(dev)) return IF_OPER_DORMANT; return IF_OPER_UP; } static void rfc2863_policy(struct net_device *dev) { unsigned int operstate = default_operstate(dev); if (operstate == READ_ONCE(dev->operstate)) return; switch(dev->link_mode) { case IF_LINK_MODE_TESTING: if (operstate == IF_OPER_UP) operstate = IF_OPER_TESTING; break; case IF_LINK_MODE_DORMANT: if (operstate == IF_OPER_UP) operstate = IF_OPER_DORMANT; break; case IF_LINK_MODE_DEFAULT: default: break; } WRITE_ONCE(dev->operstate, operstate); } void linkwatch_init_dev(struct net_device *dev) { /* Handle pre-registration link state changes */ if (!netif_carrier_ok(dev) || netif_dormant(dev) || netif_testing(dev)) rfc2863_policy(dev); } static bool linkwatch_urgent_event(struct net_device *dev) { if (!netif_running(dev)) return false; if (dev->ifindex != dev_get_iflink(dev)) return true; if (netif_is_lag_port(dev) || netif_is_lag_master(dev)) return true; return netif_carrier_ok(dev) && qdisc_tx_changing(dev); } static void linkwatch_add_event(struct net_device *dev) { unsigned long flags; spin_lock_irqsave(&lweventlist_lock, flags); if (list_empty(&dev->link_watch_list)) { list_add_tail(&dev->link_watch_list, &lweventlist); netdev_hold(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC); } spin_unlock_irqrestore(&lweventlist_lock, flags); } static void linkwatch_schedule_work(int urgent) { unsigned long delay = linkwatch_nextevent - jiffies; if (test_bit(LW_URGENT, &linkwatch_flags)) return; /* Minimise down-time: drop delay for up event. */ if (urgent) { if (test_and_set_bit(LW_URGENT, &linkwatch_flags)) return; delay = 0; } /* If we wrap around we'll delay it by at most HZ. */ if (delay > HZ) delay = 0; /* * If urgent, schedule immediate execution; otherwise, don't * override the existing timer. */ if (test_bit(LW_URGENT, &linkwatch_flags)) mod_delayed_work(system_dfl_wq, &linkwatch_work, 0); else queue_delayed_work(system_dfl_wq, &linkwatch_work, delay); } static void linkwatch_do_dev(struct net_device *dev) { /* * Make sure the above read is complete since it can be * rewritten as soon as we clear the bit below. */ smp_mb__before_atomic(); /* We are about to handle this device, * so new events can be accepted */ clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); rfc2863_policy(dev); if (dev->flags & IFF_UP) { if (netif_carrier_ok(dev)) dev_activate(dev); else dev_deactivate(dev); netif_state_change(dev); } /* Note: our callers are responsible for calling netdev_tracker_free(). * This is the reason we use __dev_put() instead of dev_put(). */ __dev_put(dev); } static void __linkwatch_run_queue(int urgent_only) { #define MAX_DO_DEV_PER_LOOP 100 int do_dev = MAX_DO_DEV_PER_LOOP; /* Use a local list here since we add non-urgent * events back to the global one when called with * urgent_only=1. */ LIST_HEAD(wrk); /* Give urgent case more budget */ if (urgent_only) do_dev += MAX_DO_DEV_PER_LOOP; /* * Limit the number of linkwatch events to one * per second so that a runaway driver does not * cause a storm of messages on the netlink * socket. This limit does not apply to up events * while the device qdisc is down. */ if (!urgent_only) linkwatch_nextevent = jiffies + HZ; /* Limit wrap-around effect on delay. 
*/ else if (time_after(linkwatch_nextevent, jiffies + HZ)) linkwatch_nextevent = jiffies; clear_bit(LW_URGENT, &linkwatch_flags); spin_lock_irq(&lweventlist_lock); list_splice_init(&lweventlist, &wrk); while (!list_empty(&wrk) && do_dev > 0) { struct net_device *dev; dev = list_first_entry(&wrk, struct net_device, link_watch_list); list_del_init(&dev->link_watch_list); if (!netif_device_present(dev) || (urgent_only && !linkwatch_urgent_event(dev))) { list_add_tail(&dev->link_watch_list, &lweventlist); continue; } /* We must free netdev tracker under * the spinlock protection. */ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); spin_unlock_irq(&lweventlist_lock); netdev_lock_ops(dev); linkwatch_do_dev(dev); netdev_unlock_ops(dev); do_dev--; spin_lock_irq(&lweventlist_lock); } /* Add the remaining work back to lweventlist */ list_splice_init(&wrk, &lweventlist); if (!list_empty(&lweventlist)) linkwatch_schedule_work(0); spin_unlock_irq(&lweventlist_lock); } static bool linkwatch_clean_dev(struct net_device *dev) { unsigned long flags; bool clean = false; spin_lock_irqsave(&lweventlist_lock, flags); if (!list_empty(&dev->link_watch_list)) { list_del_init(&dev->link_watch_list); clean = true; /* We must release netdev tracker under * the spinlock protection. */ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); } spin_unlock_irqrestore(&lweventlist_lock, flags); return clean; } void __linkwatch_sync_dev(struct net_device *dev) { netdev_ops_assert_locked(dev); if (linkwatch_clean_dev(dev)) linkwatch_do_dev(dev); } void linkwatch_sync_dev(struct net_device *dev) { if (linkwatch_clean_dev(dev)) { netdev_lock_ops(dev); linkwatch_do_dev(dev); netdev_unlock_ops(dev); } } /* Must be called with the rtnl semaphore held */ void linkwatch_run_queue(void) { __linkwatch_run_queue(0); } static void linkwatch_event(struct work_struct *dummy) { rtnl_lock(); __linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies)); rtnl_unlock(); } void linkwatch_fire_event(struct net_device *dev) { bool urgent = linkwatch_urgent_event(dev); if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { linkwatch_add_event(dev); } else if (!urgent) return; linkwatch_schedule_work(urgent); } EXPORT_SYMBOL(linkwatch_fire_event);
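linkwatch_schedule_work() turns the "one batch per second" policy into a concrete delay: urgent events run immediately, a pending slot in the future is honoured, and a wrapped (negative) delay is clamped to zero. The sketch below models only that delay decision in userspace; HZ and the jiffies values are assumptions for illustration, not kernel state.

/*
 * Userspace sketch (simplified, no workqueues) of the delay policy in
 * linkwatch_schedule_work() above.
 */
#include <stdio.h>

#define HZ 1000

static unsigned long compute_delay(unsigned long nextevent,
				   unsigned long jiffies, int urgent)
{
	unsigned long delay = nextevent - jiffies;  /* may wrap; that's the point */

	if (urgent)
		return 0;
	/* If we wrap around we'll delay it by at most HZ. */
	if (delay > HZ)
		return 0;
	return delay;
}

int main(void)
{
	unsigned long jiffies = 100000;  /* hypothetical current tick count */

	/* Normal event, next slot 400 ticks away: wait for it. */
	printf("normal, future slot: %lu\n", compute_delay(jiffies + 400, jiffies, 0));
	/* Normal event but the slot is already in the past: wrapped delay, run now. */
	printf("normal, past slot:   %lu\n", compute_delay(jiffies - 10, jiffies, 0));
	/* Urgent event: always immediate. */
	printf("urgent:              %lu\n", compute_delay(jiffies + 400, jiffies, 1));
	return 0;
}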
// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #include <linux/bpf.h> #include <linux/btf_ids.h> #include <linux/filter.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/net.h> #include <linux/workqueue.h> #include <linux/skmsg.h> #include <linux/list.h> #include <linux/jhash.h> #include <linux/sock_diag.h> #include <net/udp.h> struct bpf_stab { struct bpf_map map; struct sock **sks; struct sk_psock_progs progs; spinlock_t lock; }; #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) /* This mutex is used to * - protect race between prog/link attach/detach and link prog update, and * - protect race between releasing and accessing map in bpf_link. * A single global mutex lock is used since it is expected contention is low.
*/ static DEFINE_MUTEX(sockmap_mutex); static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, struct bpf_link *link, u32 which); static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; if (attr->max_entries == 0 || attr->key_size != 4 || (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); stab = bpf_map_area_alloc(sizeof(*stab), NUMA_NO_NODE); if (!stab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); spin_lock_init(&stab->lock); stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); if (!stab->sks) { bpf_map_area_free(stab); return ERR_PTR(-ENOMEM); } return &stab->map; } int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_map *map; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; CLASS(fd, f)(attr->target_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); mutex_lock(&sockmap_mutex); ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type); mutex_unlock(&sockmap_mutex); return ret; } int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { struct bpf_prog *prog; struct bpf_map *map; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; CLASS(fd, f)(attr->target_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); prog = bpf_prog_get(attr->attach_bpf_fd); if (IS_ERR(prog)) return PTR_ERR(prog); if (prog->type != ptype) { ret = -EINVAL; goto put_prog; } mutex_lock(&sockmap_mutex); ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type); mutex_unlock(&sockmap_mutex); put_prog: bpf_prog_put(prog); return ret; } static void sock_map_sk_acquire(struct sock *sk) __acquires(&sk->sk_lock.slock) { lock_sock(sk); rcu_read_lock(); } static void sock_map_sk_release(struct sock *sk) __releases(&sk->sk_lock.slock) { rcu_read_unlock(); release_sock(sk); } static void sock_map_add_link(struct sk_psock *psock, struct sk_psock_link *link, struct bpf_map *map, void *link_raw) { link->link_raw = link_raw; link->map = map; spin_lock_bh(&psock->link_lock); list_add_tail(&link->list, &psock->link); spin_unlock_bh(&psock->link_lock); } static void sock_map_del_link(struct sock *sk, struct sk_psock *psock, void *link_raw) { bool strp_stop = false, verdict_stop = false; struct sk_psock_link *link, *tmp; spin_lock_bh(&psock->link_lock); list_for_each_entry_safe(link, tmp, &psock->link, list) { if (link->link_raw == link_raw) { struct bpf_map *map = link->map; struct sk_psock_progs *progs = sock_map_progs(map); if (psock->saved_data_ready && progs->stream_parser) strp_stop = true; if (psock->saved_data_ready && progs->stream_verdict) verdict_stop = true; if (psock->saved_data_ready && progs->skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); break; } } spin_unlock_bh(&psock->link_lock); if (strp_stop || verdict_stop) { write_lock_bh(&sk->sk_callback_lock); if (strp_stop) sk_psock_stop_strp(sk, psock); if (verdict_stop) sk_psock_stop_verdict(sk, psock); if (psock->psock_update_sk_prot) psock->psock_update_sk_prot(sk, psock, false); write_unlock_bh(&sk->sk_callback_lock); } } static void sock_map_unref(struct sock *sk, void *link_raw) { struct sk_psock *psock = sk_psock(sk); if (likely(psock)) { sock_map_del_link(sk, psock, link_raw); 
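	/* Each map entry holds a reference on the psock taken when the socket
	 * was inserted; dropping it here may be the final put, in which case
	 * the psock is destroyed asynchronously (RCU + deferred work), not
	 * inline.
	 */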
sk_psock_put(sk, psock); } } static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) { if (!sk->sk_prot->psock_update_sk_prot) return -EINVAL; psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; return sk->sk_prot->psock_update_sk_prot(sk, psock, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock) { if (sk->sk_prot->close != sock_map_close) { psock = ERR_PTR(-EBUSY); goto out; } if (!refcount_inc_not_zero(&psock->refcnt)) psock = ERR_PTR(-EBUSY); } out: rcu_read_unlock(); return psock; } static int sock_map_link(struct bpf_map *map, struct sock *sk) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog *stream_verdict = NULL; struct bpf_prog *stream_parser = NULL; struct bpf_prog *skb_verdict = NULL; struct bpf_prog *msg_parser = NULL; struct sk_psock *psock; int ret; stream_verdict = READ_ONCE(progs->stream_verdict); if (stream_verdict) { stream_verdict = bpf_prog_inc_not_zero(stream_verdict); if (IS_ERR(stream_verdict)) return PTR_ERR(stream_verdict); } stream_parser = READ_ONCE(progs->stream_parser); if (stream_parser) { stream_parser = bpf_prog_inc_not_zero(stream_parser); if (IS_ERR(stream_parser)) { ret = PTR_ERR(stream_parser); goto out_put_stream_verdict; } } msg_parser = READ_ONCE(progs->msg_parser); if (msg_parser) { msg_parser = bpf_prog_inc_not_zero(msg_parser); if (IS_ERR(msg_parser)) { ret = PTR_ERR(msg_parser); goto out_put_stream_parser; } } skb_verdict = READ_ONCE(progs->skb_verdict); if (skb_verdict) { skb_verdict = bpf_prog_inc_not_zero(skb_verdict); if (IS_ERR(skb_verdict)) { ret = PTR_ERR(skb_verdict); goto out_put_msg_parser; } } psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; } if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || (stream_parser && READ_ONCE(psock->progs.stream_parser)) || (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) || (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) || (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; } } else { psock = sk_psock_init(sk, map->numa_node); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; } } if (msg_parser) psock_set_prog(&psock->progs.msg_parser, msg_parser); if (stream_parser) psock_set_prog(&psock->progs.stream_parser, stream_parser); if (stream_verdict) psock_set_prog(&psock->progs.stream_verdict, stream_verdict); if (skb_verdict) psock_set_prog(&psock->progs.skb_verdict, skb_verdict); /* msg_* and stream_* programs references tracked in psock after this * point. 
Reference dec and cleanup will occur through psock destructor */ ret = sock_map_init_proto(sk, psock); if (ret < 0) { sk_psock_put(sk, psock); goto out; } write_lock_bh(&sk->sk_callback_lock); if (stream_parser && stream_verdict && !psock->saved_data_ready) { if (sk_is_tcp(sk)) ret = sk_psock_init_strp(sk, psock); else ret = -EOPNOTSUPP; if (ret) { write_unlock_bh(&sk->sk_callback_lock); sk_psock_put(sk, psock); goto out; } sk_psock_start_strp(sk, psock); } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { sk_psock_start_verdict(sk,psock); } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { sk_psock_start_verdict(sk, psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; out_progs: if (skb_verdict) bpf_prog_put(skb_verdict); out_put_msg_parser: if (msg_parser) bpf_prog_put(msg_parser); out_put_stream_parser: if (stream_parser) bpf_prog_put(stream_parser); out_put_stream_verdict: if (stream_verdict) bpf_prog_put(stream_verdict); out: return ret; } static void sock_map_free(struct bpf_map *map) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); int i; /* After the sync no updates or deletes will be in-flight so it * is safe to walk map and remove entries without risking a race * in EEXIST update case. */ synchronize_rcu(); for (i = 0; i < stab->map.max_entries; i++) { struct sock **psk = &stab->sks[i]; struct sock *sk; sk = xchg(psk, NULL); if (sk) { sock_hold(sk); lock_sock(sk); rcu_read_lock(); sock_map_unref(sk, psk); rcu_read_unlock(); release_sock(sk); sock_put(sk); } } /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(stab->sks); bpf_map_area_free(stab); } static void sock_map_release_progs(struct bpf_map *map) { psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs); } static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(key >= map->max_entries)) return NULL; return READ_ONCE(stab->sks[key]); } static void *sock_map_lookup(struct bpf_map *map, void *key) { struct sock *sk; sk = __sock_map_lookup_elem(map, *(u32 *)key); if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; return sk; } static void *sock_map_lookup_sys(struct bpf_map *map, void *key) { struct sock *sk; if (map->value_size != sizeof(u64)) return ERR_PTR(-ENOSPC); sk = __sock_map_lookup_elem(map, *(u32 *)key); if (!sk) return ERR_PTR(-ENOENT); __sock_gen_cookie(sk); return &sk->sk_cookie; } static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock **psk) { struct sock *sk = NULL; int err = 0; spin_lock_bh(&stab->lock); if (!sk_test || sk_test == *psk) sk = xchg(psk, NULL); if (likely(sk)) sock_map_unref(sk, psk); else err = -EINVAL; spin_unlock_bh(&stab->lock); return err; } static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, void *link_raw) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); __sock_map_delete(stab, sk, link_raw); } static long sock_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); u32 i = *(u32 *)key; struct sock **psk; if (unlikely(i >= map->max_entries)) return -EINVAL; psk = &stab->sks[i]; return __sock_map_delete(stab, NULL, psk); } static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); 
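	/* Iteration semantics: a NULL or out-of-range key restarts the walk at
	 * index 0; otherwise the following index is returned, and -ENOENT
	 * signals that the last slot has already been reached.
	 */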
u32 i = key ? *(u32 *)key : U32_MAX; u32 *key_next = next; if (i == stab->map.max_entries - 1) return -ENOENT; if (i >= stab->map.max_entries) *key_next = 0; else *key_next = i + 1; return 0; } static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct sk_psock_link *link; struct sk_psock *psock; struct sock *osk; int ret; WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; if (unlikely(idx >= map->max_entries)) return -E2BIG; link = sk_psock_init_link(); if (!link) return -ENOMEM; ret = sock_map_link(map, sk); if (ret < 0) goto out_free; psock = sk_psock(sk); WARN_ON_ONCE(!psock); spin_lock_bh(&stab->lock); osk = stab->sks[idx]; if (osk && flags == BPF_NOEXIST) { ret = -EEXIST; goto out_unlock; } else if (!osk && flags == BPF_EXIST) { ret = -ENOENT; goto out_unlock; } sock_map_add_link(psock, link, map, &stab->sks[idx]); stab->sks[idx] = sk; if (osk) sock_map_unref(osk, &stab->sks[idx]); spin_unlock_bh(&stab->lock); return 0; out_unlock: spin_unlock_bh(&stab->lock); if (psock) sk_psock_put(sk, psock); out_free: sk_psock_free_link(link); return ret; } static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) { return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB || ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB; } static bool sock_map_redirect_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return sk->sk_state != TCP_LISTEN; else return sk->sk_state == TCP_ESTABLISHED; } static bool sock_map_sk_is_suitable(const struct sock *sk) { return !!sk->sk_prot->psock_update_sk_prot; } static bool sock_map_sk_state_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); if (sk_is_stream_unix(sk)) return (1 << sk->sk_state) & TCPF_ESTABLISHED; if (sk_is_vsock(sk) && (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) return (1 << sk->sk_state) & TCPF_ESTABLISHED; return true; } static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags) { struct socket *sock; struct sock *sk; int ret; u64 ufd; if (map->value_size == sizeof(u64)) ufd = *(u64 *)value; else ufd = *(u32 *)value; if (ufd > S32_MAX) return -EINVAL; sock = sockfd_lookup(ufd, &ret); if (!sock) return ret; sk = sock->sk; if (!sk) { ret = -EINVAL; goto out; } if (!sock_map_sk_is_suitable(sk)) { ret = -EOPNOTSUPP; goto out; } sock_map_sk_acquire(sk); if (!sock_map_sk_state_allowed(sk)) ret = -EOPNOTSUPP; else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) ret = sock_map_update_common(map, *(u32 *)key, sk, flags); else ret = sock_hash_update_common(map, key, sk, flags); sock_map_sk_release(sk); out: sockfd_put(sock); return ret; } static long sock_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { struct sock *sk = (struct sock *)value; int ret; if (unlikely(!sk || !sk_fullsock(sk))) return -EINVAL; if (!sock_map_sk_is_suitable(sk)) return -EOPNOTSUPP; local_bh_disable(); bh_lock_sock(sk); if (!sock_map_sk_state_allowed(sk)) ret = -EOPNOTSUPP; else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) ret = sock_map_update_common(map, *(u32 *)key, sk, flags); else ret = sock_hash_update_common(map, key, sk, flags); bh_unlock_sock(sk); local_bh_enable(); return ret; } BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops, struct bpf_map *, map, void 
*, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(sock_map_sk_is_suitable(sops->sk) && sock_map_op_okay(sops))) return sock_map_update_common(map, *(u32 *)key, sops->sk, flags); return -EOPNOTSUPP; } const struct bpf_func_proto bpf_sock_map_update_proto = { .func = bpf_sock_map_update, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk)) return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } const struct bpf_func_proto bpf_sk_redirect_map_proto = { .func = bpf_sk_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, struct bpf_map *, map, u32, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; if (sk_is_vsock(sk)) return SK_DROP; msg->flags = flags; msg->sk_redir = sk; return SK_PASS; } const struct bpf_func_proto bpf_msg_redirect_map_proto = { .func = bpf_msg_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; struct sock_map_seq_info { struct bpf_map *map; struct sock *sk; u32 index; }; struct bpf_iter__sockmap { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct bpf_map *, map); __bpf_md_ptr(void *, key); __bpf_md_ptr(struct sock *, sk); }; DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta, struct bpf_map *map, void *key, struct sock *sk) static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info) { if (unlikely(info->index >= info->map->max_entries)) return NULL; info->sk = __sock_map_lookup_elem(info->map, info->index); /* can't return sk directly, since that might be NULL */ return info; } static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { struct sock_map_seq_info *info = seq->private; if (*pos == 0) ++*pos; /* pairs with sock_map_seq_stop */ rcu_read_lock(); return sock_map_seq_lookup_elem(info); } static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) __must_hold(rcu) { struct sock_map_seq_info *info = seq->private; ++*pos; ++info->index; return sock_map_seq_lookup_elem(info); } static int sock_map_seq_show(struct seq_file *seq, void *v) __must_hold(rcu) { struct sock_map_seq_info *info = seq->private; struct bpf_iter__sockmap ctx = {}; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, !v); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (v) { ctx.key = &info->index; ctx.sk = info->sk; } return bpf_iter_run_prog(prog, &ctx); } static void sock_map_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { if (!v) (void)sock_map_seq_show(seq, NULL); /* pairs with sock_map_seq_start */ rcu_read_unlock(); } static const struct 
seq_operations sock_map_seq_ops = { .start = sock_map_seq_start, .next = sock_map_seq_next, .stop = sock_map_seq_stop, .show = sock_map_seq_show, }; static int sock_map_init_seq_private(void *priv_data, struct bpf_iter_aux_info *aux) { struct sock_map_seq_info *info = priv_data; bpf_map_inc_with_uref(aux->map); info->map = aux->map; return 0; } static void sock_map_fini_seq_private(void *priv_data) { struct sock_map_seq_info *info = priv_data; bpf_map_put_with_uref(info->map); } static u64 sock_map_mem_usage(const struct bpf_map *map) { u64 usage = sizeof(struct bpf_stab); usage += (u64)map->max_entries * sizeof(struct sock *); return usage; } static const struct bpf_iter_seq_info sock_map_iter_seq_info = { .seq_ops = &sock_map_seq_ops, .init_seq_private = sock_map_init_seq_private, .fini_seq_private = sock_map_fini_seq_private, .seq_priv_size = sizeof(struct sock_map_seq_info), }; BTF_ID_LIST_SINGLE(sock_map_btf_ids, struct, bpf_stab) const struct bpf_map_ops sock_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_map_alloc, .map_free = sock_map_free, .map_get_next_key = sock_map_get_next_key, .map_lookup_elem_sys_only = sock_map_lookup_sys, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, .map_lookup_elem = sock_map_lookup, .map_release_uref = sock_map_release_progs, .map_check_btf = map_check_no_btf, .map_mem_usage = sock_map_mem_usage, .map_btf_id = &sock_map_btf_ids[0], .iter_seq_info = &sock_map_iter_seq_info, }; struct bpf_shtab_elem { struct rcu_head rcu; u32 hash; struct sock *sk; struct hlist_node node; u8 key[]; }; struct bpf_shtab_bucket { struct hlist_head head; spinlock_t lock; }; struct bpf_shtab { struct bpf_map map; struct bpf_shtab_bucket *buckets; u32 buckets_num; u32 elem_size; struct sk_psock_progs progs; atomic_t count; }; static inline u32 sock_hash_bucket_hash(const void *key, u32 len) { return jhash(key, len, 0); } static struct bpf_shtab_bucket *sock_hash_select_bucket(struct bpf_shtab *htab, u32 hash) { return &htab->buckets[hash & (htab->buckets_num - 1)]; } static struct bpf_shtab_elem * sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, u32 key_size) { struct bpf_shtab_elem *elem; hlist_for_each_entry_rcu(elem, head, node) { if (elem->hash == hash && !memcmp(&elem->key, key, key_size)) return elem; } return NULL; } static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; WARN_ON_ONCE(!rcu_read_lock_held()); hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); return elem ? elem->sk : NULL; } static void sock_hash_free_elem(struct bpf_shtab *htab, struct bpf_shtab_elem *elem) { atomic_dec(&htab->count); kfree_rcu(elem, rcu); } static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, void *link_raw) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_elem *elem_probe, *elem = link_raw; struct bpf_shtab_bucket *bucket; WARN_ON_ONCE(!rcu_read_lock_held()); bucket = sock_hash_select_bucket(htab, elem->hash); /* elem may be deleted in parallel from the map, but access here * is okay since it's going away only after RCU grace period. * However, we need to check whether it's still present. 
*/ spin_lock_bh(&bucket->lock); elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash, elem->key, map->key_size); if (elem_probe && elem_probe == elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } spin_unlock_bh(&bucket->lock); } static long sock_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 hash, key_size = map->key_size; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; int ret = -ENOENT; hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); ret = 0; } spin_unlock_bh(&bucket->lock); return ret; } static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab, void *key, u32 key_size, u32 hash, struct sock *sk, struct bpf_shtab_elem *old) { struct bpf_shtab_elem *new; if (atomic_inc_return(&htab->count) > htab->map.max_entries) { if (!old) { atomic_dec(&htab->count); return ERR_PTR(-E2BIG); } } new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, htab->map.numa_node); if (!new) { atomic_dec(&htab->count); return ERR_PTR(-ENOMEM); } memcpy(new->key, key, key_size); new->sk = sk; new->hash = hash; return new; } static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; struct bpf_shtab_elem *elem, *elem_new; struct bpf_shtab_bucket *bucket; struct sk_psock_link *link; struct sk_psock *psock; int ret; WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; link = sk_psock_init_link(); if (!link) return -ENOMEM; ret = sock_map_link(map, sk); if (ret < 0) goto out_free; psock = sk_psock(sk); WARN_ON_ONCE(!psock); hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem && flags == BPF_NOEXIST) { ret = -EEXIST; goto out_unlock; } else if (!elem && flags == BPF_EXIST) { ret = -ENOENT; goto out_unlock; } elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem); if (IS_ERR(elem_new)) { ret = PTR_ERR(elem_new); goto out_unlock; } sock_map_add_link(psock, link, map, elem_new); /* Add new element to the head of the list, so that * concurrent search will find it before old elem. 
*/ hlist_add_head_rcu(&elem_new->node, &bucket->head); if (elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } spin_unlock_bh(&bucket->lock); return 0; out_unlock: spin_unlock_bh(&bucket->lock); sk_psock_put(sk, psock); out_free: sk_psock_free_link(link); return ret; } static int sock_hash_get_next_key(struct bpf_map *map, void *key, void *key_next) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_elem *elem, *elem_next; u32 hash, key_size = map->key_size; struct hlist_head *head; int i = 0; if (!key) goto find_first_elem; hash = sock_hash_bucket_hash(key, key_size); head = &sock_hash_select_bucket(htab, hash)->head; elem = sock_hash_lookup_elem_raw(head, hash, key, key_size); if (!elem) goto find_first_elem; elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; } i = hash & (htab->buckets_num - 1); i++; find_first_elem: for (; i < htab->buckets_num; i++) { head = &sock_hash_select_bucket(htab, i)->head; elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; } } return -ENOENT; } static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) { struct bpf_shtab *htab; int i, err; if (attr->max_entries == 0 || attr->key_size == 0 || (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); if (attr->key_size > MAX_BPF_STACK) return ERR_PTR(-E2BIG); htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&htab->map, attr); htab->buckets_num = roundup_pow_of_two(htab->map.max_entries); htab->elem_size = sizeof(struct bpf_shtab_elem) + round_up(htab->map.key_size, 8); if (htab->buckets_num == 0 || htab->buckets_num > U32_MAX / sizeof(struct bpf_shtab_bucket)) { err = -EINVAL; goto free_htab; } htab->buckets = bpf_map_area_alloc(htab->buckets_num * sizeof(struct bpf_shtab_bucket), htab->map.numa_node); if (!htab->buckets) { err = -ENOMEM; goto free_htab; } for (i = 0; i < htab->buckets_num; i++) { INIT_HLIST_HEAD(&htab->buckets[i].head); spin_lock_init(&htab->buckets[i].lock); } return &htab->map; free_htab: bpf_map_area_free(htab); return ERR_PTR(err); } static void sock_hash_free(struct bpf_map *map) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_bucket *bucket; struct hlist_head unlink_list; struct bpf_shtab_elem *elem; struct hlist_node *node; int i; /* After the sync no updates or deletes will be in-flight so it * is safe to walk map and remove entries without risking a race * in EEXIST update case. */ synchronize_rcu(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); /* We are racing with sock_hash_delete_from_link to * enter the spin-lock critical section. Every socket on * the list is still linked to sockhash. Since link * exists, psock exists and holds a ref to socket. That * lets us to grab a socket ref too. */ spin_lock_bh(&bucket->lock); hlist_for_each_entry(elem, &bucket->head, node) sock_hold(elem->sk); hlist_move_list(&bucket->head, &unlink_list); spin_unlock_bh(&bucket->lock); /* Process removed entries out of atomic context to * block for socket lock before deleting the psock's * link to sockhash. 
*/ hlist_for_each_entry_safe(elem, node, &unlink_list, node) { hlist_del(&elem->node); lock_sock(elem->sk); rcu_read_lock(); sock_map_unref(elem->sk, elem); rcu_read_unlock(); release_sock(elem->sk); sock_put(elem->sk); sock_hash_free_elem(htab, elem); } cond_resched(); } /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(htab->buckets); bpf_map_area_free(htab); } static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) { struct sock *sk; if (map->value_size != sizeof(u64)) return ERR_PTR(-ENOSPC); sk = __sock_hash_lookup_elem(map, key); if (!sk) return ERR_PTR(-ENOENT); __sock_gen_cookie(sk); return &sk->sk_cookie; } static void *sock_hash_lookup(struct bpf_map *map, void *key) { struct sock *sk; sk = __sock_hash_lookup_elem(map, key); if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; return sk; } static void sock_hash_release_progs(struct bpf_map *map) { psock_progs_drop(&container_of(map, struct bpf_shtab, map)->progs); } BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(sock_map_sk_is_suitable(sops->sk) && sock_map_op_okay(sops))) return sock_hash_update_common(map, key, sops->sk, flags); return -EOPNOTSUPP; } const struct bpf_func_proto bpf_sock_hash_update_proto = { .func = bpf_sock_hash_update, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk)) return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } const struct bpf_func_proto bpf_sk_redirect_hash_proto = { .func = bpf_sk_redirect_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, struct bpf_map *, map, void *, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; if (sk_is_vsock(sk)) return SK_DROP; msg->flags = flags; msg->sk_redir = sk; return SK_PASS; } const struct bpf_func_proto bpf_msg_redirect_hash_proto = { .func = bpf_msg_redirect_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; struct sock_hash_seq_info { struct bpf_map *map; struct bpf_shtab *htab; u32 bucket_id; }; static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info, struct bpf_shtab_elem *prev_elem) { const struct bpf_shtab *htab = info->htab; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; struct hlist_node *node; /* try to find next elem in the same bucket */ if (prev_elem) { node = rcu_dereference(hlist_next_rcu(&prev_elem->node)); elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); if (elem) return elem; /* no more 
elements, continue in the next bucket */ info->bucket_id++; } for (; info->bucket_id < htab->buckets_num; info->bucket_id++) { bucket = &htab->buckets[info->bucket_id]; node = rcu_dereference(hlist_first_rcu(&bucket->head)); elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); if (elem) return elem; } return NULL; } static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { struct sock_hash_seq_info *info = seq->private; if (*pos == 0) ++*pos; /* pairs with sock_hash_seq_stop */ rcu_read_lock(); return sock_hash_seq_find_next(info, NULL); } static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos) __must_hold(rcu) { struct sock_hash_seq_info *info = seq->private; ++*pos; return sock_hash_seq_find_next(info, v); } static int sock_hash_seq_show(struct seq_file *seq, void *v) __must_hold(rcu) { struct sock_hash_seq_info *info = seq->private; struct bpf_iter__sockmap ctx = {}; struct bpf_shtab_elem *elem = v; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, !elem); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (elem) { ctx.key = elem->key; ctx.sk = elem->sk; } return bpf_iter_run_prog(prog, &ctx); } static void sock_hash_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { if (!v) (void)sock_hash_seq_show(seq, NULL); /* pairs with sock_hash_seq_start */ rcu_read_unlock(); } static const struct seq_operations sock_hash_seq_ops = { .start = sock_hash_seq_start, .next = sock_hash_seq_next, .stop = sock_hash_seq_stop, .show = sock_hash_seq_show, }; static int sock_hash_init_seq_private(void *priv_data, struct bpf_iter_aux_info *aux) { struct sock_hash_seq_info *info = priv_data; bpf_map_inc_with_uref(aux->map); info->map = aux->map; info->htab = container_of(aux->map, struct bpf_shtab, map); return 0; } static void sock_hash_fini_seq_private(void *priv_data) { struct sock_hash_seq_info *info = priv_data; bpf_map_put_with_uref(info->map); } static u64 sock_hash_mem_usage(const struct bpf_map *map) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u64 usage = sizeof(*htab); usage += htab->buckets_num * sizeof(struct bpf_shtab_bucket); usage += atomic_read(&htab->count) * (u64)htab->elem_size; return usage; } static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { .seq_ops = &sock_hash_seq_ops, .init_seq_private = sock_hash_init_seq_private, .fini_seq_private = sock_hash_fini_seq_private, .seq_priv_size = sizeof(struct sock_hash_seq_info), }; BTF_ID_LIST_SINGLE(sock_hash_map_btf_ids, struct, bpf_shtab) const struct bpf_map_ops sock_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_hash_alloc, .map_free = sock_hash_free, .map_get_next_key = sock_hash_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_hash_delete_elem, .map_lookup_elem = sock_hash_lookup, .map_lookup_elem_sys_only = sock_hash_lookup_sys, .map_release_uref = sock_hash_release_progs, .map_check_btf = map_check_no_btf, .map_mem_usage = sock_hash_mem_usage, .map_btf_id = &sock_hash_map_btf_ids[0], .iter_seq_info = &sock_hash_iter_seq_info, }; static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) { switch (map->map_type) { case BPF_MAP_TYPE_SOCKMAP: return &container_of(map, struct bpf_stab, map)->progs; case BPF_MAP_TYPE_SOCKHASH: return &container_of(map, struct bpf_shtab, map)->progs; default: break; } return NULL; } static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog, struct bpf_link ***plink, u32 
which) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog **cur_pprog; struct bpf_link **cur_plink; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: cur_pprog = &progs->msg_parser; cur_plink = &progs->msg_parser_link; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: cur_pprog = &progs->stream_parser; cur_plink = &progs->stream_parser_link; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; cur_pprog = &progs->stream_verdict; cur_plink = &progs->stream_verdict_link; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; cur_pprog = &progs->skb_verdict; cur_plink = &progs->skb_verdict_link; break; default: return -EOPNOTSUPP; } *pprog = cur_pprog; if (plink) *plink = cur_plink; return 0; } /* Handle the following four cases: * prog_attach: prog != NULL, old == NULL, link == NULL * prog_detach: prog == NULL, old != NULL, link == NULL * link_attach: prog != NULL, old == NULL, link != NULL * link_detach: prog == NULL, old != NULL, link != NULL */ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, struct bpf_link *link, u32 which) { struct bpf_prog **pprog; struct bpf_link **plink; int ret; ret = sock_map_prog_link_lookup(map, &pprog, &plink, which); if (ret) return ret; /* for prog_attach/prog_detach/link_attach, return error if a bpf_link * exists for that prog. */ if ((!link || prog) && *plink) return -EBUSY; if (old) { ret = psock_replace_prog(pprog, prog, old); if (!ret) *plink = NULL; } else { psock_set_prog(pprog, prog); if (link) *plink = link; } return ret; } int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); u32 prog_cnt = 0, flags = 0; struct bpf_prog **pprog; struct bpf_prog *prog; struct bpf_map *map; u32 id = 0; int ret; if (attr->query.query_flags) return -EINVAL; CLASS(fd, f)(attr->target_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); rcu_read_lock(); ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type); if (ret) goto end; prog = *pprog; prog_cnt = !prog ? 0 : 1; if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) goto end; /* we do not hold the refcnt, the bpf prog may be released * asynchronously and the id would be set to 0. 
*/ id = data_race(prog->aux->id); if (id == 0) prog_cnt = 0; end: rcu_read_unlock(); if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) || (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) || copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) ret = -EFAULT; return ret; } static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link) { switch (link->map->map_type) { case BPF_MAP_TYPE_SOCKMAP: return sock_map_delete_from_link(link->map, sk, link->link_raw); case BPF_MAP_TYPE_SOCKHASH: return sock_hash_delete_from_link(link->map, sk, link->link_raw); default: break; } } static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock) { struct sk_psock_link *link; while ((link = sk_psock_link_pop(psock))) { sock_map_unlink(sk, link); sk_psock_free_link(link); } } void sock_map_unhash(struct sock *sk) { void (*saved_unhash)(struct sock *sk); struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_unhash = READ_ONCE(sk->sk_prot)->unhash; } else { saved_unhash = psock->saved_unhash; sock_map_remove_links(sk, psock); rcu_read_unlock(); } if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) return; if (saved_unhash) saved_unhash(sk); } EXPORT_SYMBOL_GPL(sock_map_unhash); void sock_map_destroy(struct sock *sk) { void (*saved_destroy)(struct sock *sk); struct sk_psock *psock; rcu_read_lock(); psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_destroy = READ_ONCE(sk->sk_prot)->destroy; } else { saved_destroy = psock->saved_destroy; sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock); sk_psock_put(sk, psock); } if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) return; if (saved_destroy) saved_destroy(sk); } EXPORT_SYMBOL_GPL(sock_map_destroy); void sock_map_close(struct sock *sk, long timeout) { void (*saved_close)(struct sock *sk, long timeout); struct sk_psock *psock; lock_sock(sk); rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { saved_close = psock->saved_close; sock_map_remove_links(sk, psock); psock = sk_psock_get(sk); if (unlikely(!psock)) goto no_psock; rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); } else { saved_close = READ_ONCE(sk->sk_prot)->close; no_psock: rcu_read_unlock(); release_sock(sk); } /* Make sure we do not recurse. This is a bug. * Leak the socket instead of crashing on a stack overflow. 
*/ if (WARN_ON_ONCE(saved_close == sock_map_close)) return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); struct sockmap_link { struct bpf_link link; struct bpf_map *map; }; static void sock_map_link_release(struct bpf_link *link) { struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); mutex_lock(&sockmap_mutex); if (!sockmap_link->map) goto out; WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, link->attach_type)); bpf_map_put_with_uref(sockmap_link->map); sockmap_link->map = NULL; out: mutex_unlock(&sockmap_mutex); } static int sock_map_link_detach(struct bpf_link *link) { sock_map_link_release(link); return 0; } static void sock_map_link_dealloc(struct bpf_link *link) { kfree(link); } /* Handle the following two cases: * case 1: link != NULL, prog != NULL, old != NULL * case 2: link != NULL, prog != NULL, old == NULL */ static int sock_map_link_update_prog(struct bpf_link *link, struct bpf_prog *prog, struct bpf_prog *old) { const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); struct bpf_prog **pprog, *old_link_prog; struct bpf_link **plink; int ret = 0; mutex_lock(&sockmap_mutex); /* If old prog is not NULL, ensure old prog is the same as link->prog. */ if (old && link->prog != old) { ret = -EPERM; goto out; } /* Ensure link->prog has the same type/attach_type as the new prog. */ if (link->prog->type != prog->type || link->prog->expected_attach_type != prog->expected_attach_type) { ret = -EINVAL; goto out; } if (!sockmap_link->map) { ret = -ENOLINK; goto out; } ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, link->attach_type); if (ret) goto out; /* return error if the stored bpf_link does not match the incoming bpf_link. 
*/ if (link != *plink) { ret = -EBUSY; goto out; } if (old) { ret = psock_replace_prog(pprog, prog, old); if (ret) goto out; } else { psock_set_prog(pprog, prog); } bpf_prog_inc(prog); old_link_prog = xchg(&link->prog, prog); bpf_prog_put(old_link_prog); out: mutex_unlock(&sockmap_mutex); return ret; } static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link) { u32 map_id = 0; mutex_lock(&sockmap_mutex); if (sockmap_link->map) map_id = sockmap_link->map->id; mutex_unlock(&sockmap_mutex); return map_id; } static int sock_map_link_fill_info(const struct bpf_link *link, struct bpf_link_info *info) { const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); u32 map_id = sock_map_link_get_map_id(sockmap_link); info->sockmap.map_id = map_id; info->sockmap.attach_type = link->attach_type; return 0; } static void sock_map_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); u32 map_id = sock_map_link_get_map_id(sockmap_link); seq_printf(seq, "map_id:\t%u\n", map_id); seq_printf(seq, "attach_type:\t%u\n", link->attach_type); } static const struct bpf_link_ops sock_map_link_ops = { .release = sock_map_link_release, .dealloc = sock_map_link_dealloc, .detach = sock_map_link_detach, .update_prog = sock_map_link_update_prog, .fill_link_info = sock_map_link_fill_info, .show_fdinfo = sock_map_link_show_fdinfo, }; int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; struct sockmap_link *sockmap_link; enum bpf_attach_type attach_type; struct bpf_map *map; int ret; if (attr->link_create.flags) return -EINVAL; map = bpf_map_get_with_uref(attr->link_create.target_fd); if (IS_ERR(map)) return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) { ret = -EINVAL; goto out; } sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER); if (!sockmap_link) { ret = -ENOMEM; goto out; } attach_type = attr->link_create.attach_type; bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog, attach_type); sockmap_link->map = map; ret = bpf_link_prime(&sockmap_link->link, &link_primer); if (ret) { kfree(sockmap_link); goto out; } mutex_lock(&sockmap_mutex); ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type); mutex_unlock(&sockmap_mutex); if (ret) { bpf_link_cleanup(&link_primer); goto out; } /* Increase refcnt for the prog since when old prog is replaced with * psock_replace_prog() and psock_set_prog() its refcnt will be decreased. * * Actually, we do not need to increase refcnt for the prog since bpf_link * will hold a reference. But in order to have less complexity w.r.t. * replacing/setting prog, let us increase the refcnt to make things simpler. 
*/ bpf_prog_inc(prog); return bpf_link_settle(&link_primer); out: bpf_map_put_with_uref(map); return ret; } static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) { struct bpf_map *map; int err = -EINVAL; if (!linfo->map.map_fd) return -EBADF; map = bpf_map_get_with_uref(linfo->map.map_fd); if (IS_ERR(map)) return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) goto put_map; if (prog->aux->max_rdonly_access > map->key_size) { err = -EACCES; goto put_map; } aux->map = map; return 0; put_map: bpf_map_put_with_uref(map); return err; } static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux) { bpf_map_put_with_uref(aux->map); } static struct bpf_iter_reg sock_map_iter_reg = { .target = "sockmap", .attach_target = sock_map_iter_attach_target, .detach_target = sock_map_iter_detach_target, .show_fdinfo = bpf_iter_map_show_fdinfo, .fill_link_info = bpf_iter_map_fill_link_info, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, }, }; static int __init bpf_sockmap_iter_init(void) { sock_map_iter_reg.ctx_arg_info[1].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK]; return bpf_iter_reg_target(&sock_map_iter_reg); } late_initcall(bpf_sockmap_iter_init);
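For orientation, the sockmap update path above (sock_map_update_elem_sys(), reached through the bpf_map_update_elem() syscall) is normally driven from user space via libbpf. The sketch below is illustrative only: it assumes a previously loaded BPF_PROG_TYPE_SK_SKB verdict program whose fd is prog_fd and an already established, connected TCP socket sock_fd; prog_fd, sock_fd and add_sock_to_map are placeholder names, not part of the kernel code above.

#include <errno.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>		/* libbpf syscall wrappers */

/* Minimal user-space sketch: create a sockmap, attach a stream verdict
 * program, and insert an established TCP socket. prog_fd and sock_fd are
 * assumed to come from elsewhere (bpf_prog_load()/socket setup).
 */
static int add_sock_to_map(int prog_fd, int sock_fd)
{
	__u32 key = 0;
	__u64 value = sock_fd;	/* sockmap value is a socket fd, u32 or u64 */
	int map_fd;

	map_fd = bpf_map_create(BPF_MAP_TYPE_SOCKMAP, "sock_map",
				sizeof(__u32), sizeof(__u64), 1, NULL);
	if (map_fd < 0)
		return -errno;

	/* Lands in sock_map_get_from_fd()/sock_map_prog_update() above. */
	if (bpf_prog_attach(prog_fd, map_fd, BPF_SK_SKB_STREAM_VERDICT, 0))
		return -errno;

	/* Lands in sock_map_update_elem_sys(): the fd is resolved to a
	 * struct sock and linked into the map via sock_map_update_common().
	 */
	if (bpf_map_update_elem(map_fd, &key, &value, BPF_ANY))
		return -errno;

	return map_fd;
}

Note that the socket must be in an allowed state at insert time (for TCP, established or listening, per sock_map_sk_state_allowed() above), otherwise the update fails with -EOPNOTSUPP.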
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016-present, Facebook, Inc. * All rights reserved.
* */ #include <linux/bio.h> #include <linux/bitmap.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/pagemap.h> #include <linux/refcount.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/zstd.h> #include "misc.h" #include "fs.h" #include "btrfs_inode.h" #include "compression.h" #include "super.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1U << ZSTD_BTRFS_MAX_WINDOWLOG) #define ZSTD_BTRFS_DEFAULT_LEVEL 3 #define ZSTD_BTRFS_MIN_LEVEL -15 #define ZSTD_BTRFS_MAX_LEVEL 15 /* 307s to avoid pathologically clashing with transaction commit */ #define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ) static zstd_parameters zstd_get_btrfs_parameters(int level, size_t src_len) { zstd_parameters params = zstd_get_params(level, src_len); if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG) params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG; WARN_ON(src_len > ZSTD_BTRFS_MAX_INPUT); return params; } struct workspace { void *mem; size_t size; char *buf; int level; int req_level; unsigned long last_used; /* jiffies */ struct list_head list; struct list_head lru_list; zstd_in_buffer in_buf; zstd_out_buffer out_buf; zstd_parameters params; }; /* * Zstd Workspace Management * * Zstd workspaces have different memory requirements depending on the level. * The zstd workspaces are managed by having individual lists for each level * and a global lru. Forward progress is maintained by protecting a max level * workspace. * * Getting a workspace is done by using the bitmap to identify the levels that * have available workspaces and scans up. This lets us recycle higher level * workspaces because of the monotonic memory guarantee. A workspace's * last_used is only updated if it is being used by the corresponding memory * level. Putting a workspace involves adding it back to the appropriate places * and adding it back to the lru if necessary. * * A timer is used to reclaim workspaces if they have not been used for * ZSTD_BTRFS_RECLAIM_JIFFIES. This helps keep only active workspaces around. * The upper bound is provided by the workqueue limit which is 2 (percpu limit). */ struct zstd_workspace_manager { spinlock_t lock; struct list_head lru_list; struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL]; unsigned long active_map; wait_queue_head_t wait; struct timer_list timer; }; static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL]; static inline struct workspace *list_to_workspace(struct list_head *list) { return container_of(list, struct workspace, list); } static inline int clip_level(int level) { return max(0, level - 1); } /* * Timer callback to free unused workspaces. * * @t: timer * * This scans the lru_list and attempts to reclaim any workspace that hasn't * been used for ZSTD_BTRFS_RECLAIM_JIFFIES. * * The context is softirq and does not need the _bh locking primitives. 
*/ static void zstd_reclaim_timer_fn(struct timer_list *timer) { struct zstd_workspace_manager *zwsm = container_of(timer, struct zstd_workspace_manager, timer); unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; spin_lock(&zwsm->lock); if (list_empty(&zwsm->lru_list)) { spin_unlock(&zwsm->lock); return; } list_for_each_prev_safe(pos, next, &zwsm->lru_list) { struct workspace *victim = container_of(pos, struct workspace, lru_list); int level; if (time_after(victim->last_used, reclaim_threshold)) break; /* workspace is in use */ if (victim->req_level) continue; level = victim->level; list_del(&victim->lru_list); list_del(&victim->list); zstd_free_workspace(&victim->list); if (list_empty(&zwsm->idle_ws[level])) clear_bit(level, &zwsm->active_map); } if (!list_empty(&zwsm->lru_list)) mod_timer(&zwsm->timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); spin_unlock(&zwsm->lock); } /* * Calculate monotonic memory bounds. * * It is possible based on the level configurations that a higher level * workspace uses less memory than a lower level workspace. In order to reuse * workspaces, this must be made a monotonic relationship. This precomputes * the required memory for each level and enforces the monotonicity between * level and memory required. */ static void zstd_calc_ws_mem_sizes(void) { size_t max_size = 0; int level; for (level = ZSTD_BTRFS_MIN_LEVEL; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { if (level == 0) continue; zstd_parameters params = zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT); size_t level_size = max_t(size_t, zstd_cstream_workspace_bound(&params.cParams), zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT)); max_size = max_t(size_t, max_size, level_size); /* Use level 1 workspace size for all the fast mode negative levels. */ zstd_ws_mem_sizes[clip_level(level)] = max_size; } } int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info) { struct zstd_workspace_manager *zwsm; struct list_head *ws; ASSERT(fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] == NULL); zwsm = kzalloc(sizeof(*zwsm), GFP_KERNEL); if (!zwsm) return -ENOMEM; zstd_calc_ws_mem_sizes(); spin_lock_init(&zwsm->lock); init_waitqueue_head(&zwsm->wait); timer_setup(&zwsm->timer, zstd_reclaim_timer_fn, 0); INIT_LIST_HEAD(&zwsm->lru_list); for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&zwsm->idle_ws[i]); fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = zwsm; ws = zstd_alloc_workspace(fs_info, ZSTD_BTRFS_MAX_LEVEL); if (IS_ERR(ws)) { btrfs_warn(NULL, "cannot preallocate zstd compression workspace"); } else { set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &zwsm->active_map); list_add(ws, &zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]); } return 0; } void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info) { struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct workspace *workspace; if (!zwsm) return; fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = NULL; spin_lock_bh(&zwsm->lock); for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) { while (!list_empty(&zwsm->idle_ws[i])) { workspace = container_of(zwsm->idle_ws[i].next, struct workspace, list); list_del(&workspace->list); list_del(&workspace->lru_list); zstd_free_workspace(&workspace->list); } } spin_unlock_bh(&zwsm->lock); timer_delete_sync(&zwsm->timer); kfree(zwsm); } /* * Find workspace for given level. * * @level: compression level * * This iterates over the set bits in the active_map beginning at the requested * compression level. 
This lets us utilize already allocated workspaces before * allocating a new one. If the workspace is of a larger size, it is used, but * the place in the lru_list and last_used times are not updated. This is to * offer the opportunity to reclaim the workspace in favor of allocating an * appropriately sized one in the future. */ static struct list_head *zstd_find_workspace(struct btrfs_fs_info *fs_info, int level) { struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct list_head *ws; struct workspace *workspace; int i = clip_level(level); ASSERT(zwsm); spin_lock_bh(&zwsm->lock); for_each_set_bit_from(i, &zwsm->active_map, ZSTD_BTRFS_MAX_LEVEL) { if (!list_empty(&zwsm->idle_ws[i])) { ws = zwsm->idle_ws[i].next; workspace = list_to_workspace(ws); list_del_init(ws); /* keep its place if it's a lower level using this */ workspace->req_level = level; if (clip_level(level) == workspace->level) list_del(&workspace->lru_list); if (list_empty(&zwsm->idle_ws[i])) clear_bit(i, &zwsm->active_map); spin_unlock_bh(&zwsm->lock); return ws; } } spin_unlock_bh(&zwsm->lock); return NULL; } /* * Zstd get_workspace for level. * * @level: compression level * * If @level is 0, then any compression level can be used. Therefore, we begin * scanning from 1. We first scan through possible workspaces and then after * attempt to allocate a new workspace. If we fail to allocate one due to * memory pressure, go to sleep waiting for the max level workspace to free up. */ struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level) { struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct list_head *ws; unsigned int nofs_flag; ASSERT(zwsm); /* level == 0 means we can use any workspace */ if (!level) level = 1; again: ws = zstd_find_workspace(fs_info, level); if (ws) return ws; nofs_flag = memalloc_nofs_save(); ws = zstd_alloc_workspace(fs_info, level); memalloc_nofs_restore(nofs_flag); if (IS_ERR(ws)) { DEFINE_WAIT(wait); prepare_to_wait(&zwsm->wait, &wait, TASK_UNINTERRUPTIBLE); schedule(); finish_wait(&zwsm->wait, &wait); goto again; } return ws; } /* * Zstd put_workspace. * * @ws: list_head for the workspace * * When putting back a workspace, we only need to update the LRU if we are of * the requested compression level. Here is where we continue to protect the * max level workspace or update last_used accordingly. If the reclaim timer * isn't set, it is also set here. Only the max level workspace tries and wakes * up waiting workspaces. 
*/ void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws) { struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct workspace *workspace = list_to_workspace(ws); ASSERT(zwsm); spin_lock_bh(&zwsm->lock); /* A node is only taken off the lru if we are the corresponding level */ if (clip_level(workspace->req_level) == workspace->level) { /* Hide a max level workspace from reclaim */ if (list_empty(&zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { INIT_LIST_HEAD(&workspace->lru_list); } else { workspace->last_used = jiffies; list_add(&workspace->lru_list, &zwsm->lru_list); if (!timer_pending(&zwsm->timer)) mod_timer(&zwsm->timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); } } set_bit(workspace->level, &zwsm->active_map); list_add(&workspace->list, &zwsm->idle_ws[workspace->level]); workspace->req_level = 0; spin_unlock_bh(&zwsm->lock); if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL)) cond_wake_up(&zwsm->wait); } void zstd_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); kvfree(workspace->mem); kfree(workspace->buf); kfree(workspace); } struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level) { const u32 blocksize = fs_info->sectorsize; struct workspace *workspace; workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); if (!workspace) return ERR_PTR(-ENOMEM); /* Use level 1 workspace size for all the fast mode negative levels. */ workspace->size = zstd_ws_mem_sizes[clip_level(level)]; workspace->level = clip_level(level); workspace->req_level = level; workspace->last_used = jiffies; workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); workspace->buf = kmalloc(blocksize, GFP_KERNEL); if (!workspace->mem || !workspace->buf) goto fail; INIT_LIST_HEAD(&workspace->list); INIT_LIST_HEAD(&workspace->lru_list); return &workspace->list; fail: zstd_free_workspace(&workspace->list); return ERR_PTR(-ENOMEM); } int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); struct address_space *mapping = inode->vfs_inode.i_mapping; zstd_cstream *stream; int ret = 0; int nr_folios = 0; struct folio *in_folio = NULL; /* The current folio to read. */ struct folio *out_folio = NULL; /* The current folio to write to. 
*/ unsigned long tot_in = 0; unsigned long tot_out = 0; unsigned long len = *total_out; const unsigned long nr_dest_folios = *out_folios; const u64 orig_end = start + len; const u32 blocksize = fs_info->sectorsize; const u32 min_folio_size = btrfs_min_folio_size(fs_info); unsigned long max_out = nr_dest_folios * min_folio_size; unsigned int cur_len; workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len); *out_folios = 0; *total_out = 0; *total_in = 0; /* Initialize the stream */ stream = zstd_init_cstream(&workspace->params, len, workspace->mem, workspace->size); if (unlikely(!stream)) { btrfs_err(fs_info, "zstd compression init level %d failed, root %llu inode %llu offset %llu", workspace->req_level, btrfs_root_id(inode->root), btrfs_ino(inode), start); ret = -EIO; goto out; } /* map in the first page of input data */ ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; cur_len = btrfs_calc_input_length(in_folio, orig_end, start); workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_folio(in_folio, start)); workspace->in_buf.pos = 0; workspace->in_buf.size = cur_len; /* Allocate and map in the output buffer */ out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; } folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); while (1) { size_t ret2; ret2 = zstd_compress_stream(stream, &workspace->out_buf, &workspace->in_buf); if (unlikely(zstd_is_error(ret2))) { btrfs_warn(fs_info, "zstd compression level %d failed, error %d root %llu inode %llu offset %llu", workspace->req_level, zstd_get_error_code(ret2), btrfs_root_id(inode->root), btrfs_ino(inode), start); ret = -EIO; goto out; } /* Check to see if we are making it bigger */ if (tot_in + workspace->in_buf.pos > blocksize * 2 && tot_in + workspace->in_buf.pos < tot_out + workspace->out_buf.pos) { ret = -E2BIG; goto out; } /* We've reached the end of our output range */ if (workspace->out_buf.pos >= max_out) { tot_out += workspace->out_buf.pos; ret = -E2BIG; goto out; } /* Check if we need more output space */ if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += min_folio_size; max_out -= min_folio_size; if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; } folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); } /* We've reached the end of the input */ if (workspace->in_buf.pos >= len) { tot_in += workspace->in_buf.pos; break; } /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += workspace->in_buf.size; kunmap_local(workspace->in_buf.src); workspace->in_buf.src = NULL; folio_put(in_folio); start += cur_len; len -= cur_len; ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; cur_len = btrfs_calc_input_length(in_folio, orig_end, start); workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_folio(in_folio, start)); workspace->in_buf.pos = 0; workspace->in_buf.size = cur_len; } } while (1) { size_t ret2; ret2 = zstd_end_stream(stream, &workspace->out_buf); if (unlikely(zstd_is_error(ret2))) { btrfs_err(fs_info, "zstd compression end level %d failed, error %d root %llu inode %llu offset %llu", 
workspace->req_level, zstd_get_error_code(ret2), btrfs_root_id(inode->root), btrfs_ino(inode), start); ret = -EIO; goto out; } if (ret2 == 0) { tot_out += workspace->out_buf.pos; break; } if (workspace->out_buf.pos >= max_out) { tot_out += workspace->out_buf.pos; ret = -E2BIG; goto out; } tot_out += min_folio_size; max_out -= min_folio_size; if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; } folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); } if (tot_out >= tot_in) { ret = -E2BIG; goto out; } ret = 0; *total_in = tot_in; *total_out = tot_out; out: *out_folios = nr_folios; if (workspace->in_buf.src) { kunmap_local(workspace->in_buf.src); folio_put(in_folio); } return ret; } int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); struct workspace *workspace = list_entry(ws, struct workspace, list); struct folio **folios_in = cb->compressed_folios; size_t srclen = cb->compressed_len; zstd_dstream *stream; int ret = 0; const u32 blocksize = fs_info->sectorsize; const unsigned int min_folio_size = btrfs_min_folio_size(fs_info); unsigned long folio_in_index = 0; unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size); unsigned long buf_start; unsigned long total_out = 0; stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (unlikely(!stream)) { struct btrfs_inode *inode = cb->bbio.inode; btrfs_err(inode->root->fs_info, "zstd decompression init failed, root %llu inode %llu offset %llu", btrfs_root_id(inode->root), btrfs_ino(inode), cb->start); ret = -EIO; goto done; } workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, min_folio_size); workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; workspace->out_buf.size = blocksize; while (1) { size_t ret2; ret2 = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); if (unlikely(zstd_is_error(ret2))) { struct btrfs_inode *inode = cb->bbio.inode; btrfs_err(inode->root->fs_info, "zstd decompression failed, error %d root %llu inode %llu offset %llu", zstd_get_error_code(ret2), btrfs_root_id(inode->root), btrfs_ino(inode), cb->start); ret = -EIO; goto done; } buf_start = total_out; total_out += workspace->out_buf.pos; workspace->out_buf.pos = 0; ret = btrfs_decompress_buf2page(workspace->out_buf.dst, total_out - buf_start, cb, buf_start); if (ret == 0) break; if (workspace->in_buf.pos >= srclen) break; /* Check if we've hit the end of a frame */ if (ret2 == 0) break; if (workspace->in_buf.pos == workspace->in_buf.size) { kunmap_local(workspace->in_buf.src); folio_in_index++; if (unlikely(folio_in_index >= total_folios_in)) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= min_folio_size; workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, min_folio_size); } } ret = 0; done: if (workspace->in_buf.src) kunmap_local(workspace->in_buf.src); return ret; } int zstd_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); struct 
btrfs_fs_info *fs_info = btrfs_sb(folio_inode(dest_folio)->i_sb); const u32 sectorsize = fs_info->sectorsize; zstd_dstream *stream; int ret = 0; unsigned long to_copy = 0; stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (unlikely(!stream)) { struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zstd decompression init failed, root %llu inode %llu offset %llu", btrfs_root_id(inode->root), btrfs_ino(inode), folio_pos(dest_folio)); ret = -EIO; goto finish; } workspace->in_buf.src = data_in; workspace->in_buf.pos = 0; workspace->in_buf.size = srclen; workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; workspace->out_buf.size = sectorsize; /* * Since both input and output buffers should not exceed one sector, * one call should end the decompression. */ ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); if (unlikely(zstd_is_error(ret))) { struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zstd decompression failed, error %d root %llu inode %llu offset %llu", zstd_get_error_code(ret), btrfs_root_id(inode->root), btrfs_ino(inode), folio_pos(dest_folio)); goto finish; } to_copy = workspace->out_buf.pos; memcpy_to_folio(dest_folio, dest_pgoff, workspace->out_buf.dst, to_copy); finish: /* Error or early end. */ if (unlikely(to_copy < destlen)) { ret = -EIO; folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy); } return ret; } const struct btrfs_compress_levels btrfs_zstd_compress = { .min_level = ZSTD_BTRFS_MIN_LEVEL, .max_level = ZSTD_BTRFS_MAX_LEVEL, .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, };
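/*
 * Illustrative sketch (not part of fs/btrfs/zstd.c above): the workspace
 * manager comments describe two ideas. Per-level memory bounds are forced
 * to be monotonic so a larger workspace can always serve a smaller request,
 * and a bitmap of levels with idle workspaces is scanned upward from the
 * requested level. A minimal user-space sketch of that recycling logic,
 * using hypothetical sizes and helper names, could look like the following.
 */
#include <stdio.h>
#include <stddef.h>

#define SKETCH_MAX_LEVEL 15

static size_t sketch_ws_size[SKETCH_MAX_LEVEL];	/* hypothetical per-level sizes */
static unsigned long sketch_idle_map;		/* bit i set: idle workspace for level i + 1 */

/* Running max enforces the monotonic level-to-memory relationship. */
static void sketch_calc_ws_sizes(const size_t *raw_size)
{
	size_t max_size = 0;

	for (int level = 1; level <= SKETCH_MAX_LEVEL; level++) {
		if (raw_size[level - 1] > max_size)
			max_size = raw_size[level - 1];
		sketch_ws_size[level - 1] = max_size;
	}
}

/* Scan upward from the requested level for an idle workspace to reuse. */
static int sketch_find_idle_level(int level)
{
	for (int i = level - 1; i < SKETCH_MAX_LEVEL; i++)
		if (sketch_idle_map & (1UL << i))
			return i + 1;
	return -1;	/* nothing idle: the caller would allocate a new one */
}

int main(void)
{
	/* hypothetical raw bounds, intentionally non-monotonic */
	size_t raw[SKETCH_MAX_LEVEL] = {
		8, 16, 12, 32, 28, 64, 64, 96, 96, 128, 128, 160, 160, 192, 256
	};

	sketch_calc_ws_sizes(raw);
	/* pretend idle workspaces exist for levels 7 and 15 */
	sketch_idle_map = (1UL << 6) | (1UL << 14);

	/* a level 3 request reuses the level 7 workspace; monotonicity makes it big enough */
	printf("level 3 -> reuse level %d (%zu bytes)\n",
	       sketch_find_idle_level(3), sketch_ws_size[6]);
	/* a level 9 request skips level 7 and lands on the level 15 workspace */
	printf("level 9 -> reuse level %d\n", sketch_find_idle_level(9));
	return 0;
}
/*
 * Without the running max above, a higher-level workspace could be smaller
 * than what a lower-level request needs, and the upward bitmap scan would
 * hand out undersized memory; the monotonic bound is what makes reuse safe.
 */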
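/*
 * A second illustrative sketch (again not kernel code): the reclaim comments
 * above describe freeing workspaces that have not been used for
 * ZSTD_BTRFS_RECLAIM_JIFFIES while skipping any workspace that is currently
 * handling a request. Ignoring the LRU ordering, the locking and the timer,
 * the aging decision itself reduces to something like this hypothetical
 * user-space version:
 */
#include <stdio.h>
#include <time.h>

#define SKETCH_NR_WS 4
#define SKETCH_RECLAIM_SECONDS 307	/* mirrors the 307s interval chosen above */

struct sketch_ws {
	time_t last_used;
	int req_level;	/* non-zero while a request holds the workspace */
	int present;
};

/* Free every idle workspace whose last use is older than the threshold. */
static void sketch_reclaim(struct sketch_ws *ws, time_t now)
{
	for (int i = 0; i < SKETCH_NR_WS; i++) {
		if (!ws[i].present || ws[i].req_level)
			continue;
		if (now - ws[i].last_used >= SKETCH_RECLAIM_SECONDS) {
			ws[i].present = 0;
			printf("reclaimed workspace %d\n", i);
		}
	}
}

int main(void)
{
	time_t now = time(NULL);
	struct sketch_ws ws[SKETCH_NR_WS] = {
		{ .last_used = now - 10,  .req_level = 0, .present = 1 },
		{ .last_used = now - 400, .req_level = 0, .present = 1 },	/* old and idle */
		{ .last_used = now - 400, .req_level = 3, .present = 1 },	/* old but busy */
		{ .last_used = now - 5,   .req_level = 0, .present = 1 },
	};

	sketch_reclaim(ws, now);	/* only workspace 1 is reclaimed */
	return 0;
}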
// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "hard-interface.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/errno.h> #include <linux/gfp.h> #include <linux/if.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/kref.h> #include <linux/limits.h> #include <linux/list.h> #include <linux/minmax.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> #include <uapi/linux/batadv_packet.h> #include "bat_v.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "log.h" #include "mesh-interface.h" #include "originator.h" #include "send.h" #include "translation-table.h" /** * batadv_hardif_release() - release hard interface from lists and queue for * free after rcu grace period * @ref: kref pointer of the hard interface */ void batadv_hardif_release(struct kref *ref) { struct batadv_hard_iface *hard_iface; hard_iface = container_of(ref, struct batadv_hard_iface, refcount); netdev_put(hard_iface->net_dev, &hard_iface->dev_tracker); kfree_rcu(hard_iface, rcu); } /** * batadv_hardif_get_by_netdev() - Get hard interface object of a net_device * @net_dev: net_device to search for * * Return: batadv_hard_iface of net_dev (with increased refcnt), NULL on errors */ struct batadv_hard_iface * batadv_hardif_get_by_netdev(const struct net_device *net_dev) { struct batadv_hard_iface *hard_iface; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->net_dev == net_dev && kref_get_unless_zero(&hard_iface->refcount)) goto out; } hard_iface = NULL; out: rcu_read_unlock(); return hard_iface; } /** * batadv_getlink_net() - return link net namespace (or use fallback) * @netdev: net_device to check * @fallback_net: return in case get_link_net is not available for @netdev * * Return: result of rtnl_link_ops->get_link_net or @fallback_net */ static struct net *batadv_getlink_net(const struct net_device *netdev, struct net *fallback_net) { if (!netdev->rtnl_link_ops) return fallback_net; if (!netdev->rtnl_link_ops->get_link_net) return fallback_net; return netdev->rtnl_link_ops->get_link_net(netdev); } /** * batadv_mutual_parents() - check if two devices are each others parent * @dev1: 1st net dev * @net1: 1st devices netns * @dev2: 2nd net dev * @net2: 2nd devices netns * * veth devices come in pairs and each is the parent of the other!
* * Return: true if the devices are each others parent, otherwise false */ static bool batadv_mutual_parents(const struct net_device *dev1, struct net *net1, const struct net_device *dev2, struct net *net2) { int dev1_parent_iflink = dev_get_iflink(dev1); int dev2_parent_iflink = dev_get_iflink(dev2); const struct net *dev1_parent_net; const struct net *dev2_parent_net; dev1_parent_net = batadv_getlink_net(dev1, net1); dev2_parent_net = batadv_getlink_net(dev2, net2); if (!dev1_parent_iflink || !dev2_parent_iflink) return false; return (dev1_parent_iflink == dev2->ifindex) && (dev2_parent_iflink == dev1->ifindex) && net_eq(dev1_parent_net, net2) && net_eq(dev2_parent_net, net1); } /** * batadv_is_on_batman_iface() - check if a device is a batman iface descendant * @net_dev: the device to check * * If the user creates any virtual device on top of a batman-adv interface, it * is important to prevent this new interface from being used to create a new * mesh network (this behaviour would lead to a batman-over-batman * configuration). This function recursively checks all the fathers of the * device passed as argument looking for a batman-adv mesh interface. * * Return: true if the device is descendant of a batman-adv mesh interface (or * if it is a batman-adv interface itself), false otherwise */ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) { struct net *net = dev_net(net_dev); struct net_device *parent_dev; struct net *parent_net; int iflink; bool ret; /* check if this is a batman-adv mesh interface */ if (batadv_meshif_is_valid(net_dev)) return true; iflink = dev_get_iflink(net_dev); if (iflink == 0) return false; parent_net = batadv_getlink_net(net_dev, net); /* iflink to itself, most likely physical device */ if (net == parent_net && iflink == net_dev->ifindex) return false; /* recurse over the parent device */ parent_dev = __dev_get_by_index((struct net *)parent_net, iflink); if (!parent_dev) { pr_warn("Cannot find parent device. Skipping batadv-on-batadv check for %s\n", net_dev->name); return false; } if (batadv_mutual_parents(net_dev, net, parent_dev, parent_net)) return false; ret = batadv_is_on_batman_iface(parent_dev); return ret; } static bool batadv_is_valid_iface(const struct net_device *net_dev) { if (net_dev->flags & IFF_LOOPBACK) return false; if (net_dev->type != ARPHRD_ETHER) return false; if (net_dev->addr_len != ETH_ALEN) return false; /* no batman over batman */ if (batadv_is_on_batman_iface(net_dev)) return false; return true; } /** * batadv_get_real_netdevice() - check if the given netdev struct is a virtual * interface on top of another 'real' interface * @netdev: the device to check * * Callers must hold the rtnl semaphore. You may want batadv_get_real_netdev() * instead of this. * * Return: the 'real' net device or the original net device and NULL in case * of an error. 
*/ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) { struct batadv_hard_iface *hard_iface = NULL; struct net_device *real_netdev = NULL; struct net *real_net; struct net *net; int iflink; ASSERT_RTNL(); if (!netdev) return NULL; iflink = dev_get_iflink(netdev); if (iflink == 0) { dev_hold(netdev); return netdev; } hard_iface = batadv_hardif_get_by_netdev(netdev); if (!hard_iface || !hard_iface->mesh_iface) goto out; net = dev_net(hard_iface->mesh_iface); real_net = batadv_getlink_net(netdev, net); /* iflink to itself, most likely physical device */ if (net == real_net && netdev->ifindex == iflink) { real_netdev = netdev; dev_hold(real_netdev); goto out; } real_netdev = dev_get_by_index(real_net, iflink); out: batadv_hardif_put(hard_iface); return real_netdev; } /** * batadv_get_real_netdev() - check if the given net_device struct is a virtual * interface on top of another 'real' interface * @net_device: the device to check * * Return: the 'real' net device or the original net device and NULL in case * of an error. */ struct net_device *batadv_get_real_netdev(struct net_device *net_device) { struct net_device *real_netdev; rtnl_lock(); real_netdev = batadv_get_real_netdevice(net_device); rtnl_unlock(); return real_netdev; } /** * batadv_is_wext_netdev() - check if the given net_device struct is a * wext wifi interface * @net_device: the device to check * * Return: true if the net device is a wext wireless device, false * otherwise. */ static bool batadv_is_wext_netdev(struct net_device *net_device) { if (!net_device) return false; #ifdef CONFIG_WIRELESS_EXT /* pre-cfg80211 drivers have to implement WEXT, so it is possible to * check for wireless_handlers != NULL */ if (net_device->wireless_handlers) return true; #endif return false; } /** * batadv_is_cfg80211_netdev() - check if the given net_device struct is a * cfg80211 wifi interface * @net_device: the device to check * * Return: true if the net device is a cfg80211 wireless device, false * otherwise. */ static bool batadv_is_cfg80211_netdev(struct net_device *net_device) { if (!net_device) return false; #if IS_ENABLED(CONFIG_CFG80211) /* cfg80211 drivers have to set ieee80211_ptr */ if (net_device->ieee80211_ptr) return true; #endif return false; } /** * batadv_wifi_flags_evaluate() - calculate wifi flags for net_device * @net_device: the device to check * * Return: batadv_hard_iface_wifi_flags flags of the device */ static u32 batadv_wifi_flags_evaluate(struct net_device *net_device) { u32 wifi_flags = 0; struct net_device *real_netdev; if (batadv_is_wext_netdev(net_device)) wifi_flags |= BATADV_HARDIF_WIFI_WEXT_DIRECT; if (batadv_is_cfg80211_netdev(net_device)) wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT; real_netdev = batadv_get_real_netdevice(net_device); if (!real_netdev) return wifi_flags; if (real_netdev == net_device) goto out; if (batadv_is_wext_netdev(real_netdev)) wifi_flags |= BATADV_HARDIF_WIFI_WEXT_INDIRECT; if (batadv_is_cfg80211_netdev(real_netdev)) wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_INDIRECT; out: dev_put(real_netdev); return wifi_flags; } /** * batadv_is_cfg80211_hardif() - check if the given hardif is a cfg80211 wifi * interface * @hard_iface: the device to check * * Return: true if the net device is a cfg80211 wireless device, false * otherwise. 
*/ bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface) { u32 allowed_flags = 0; allowed_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT; allowed_flags |= BATADV_HARDIF_WIFI_CFG80211_INDIRECT; return !!(hard_iface->wifi_flags & allowed_flags); } /** * batadv_is_wifi_hardif() - check if the given hardif is a wifi interface * @hard_iface: the device to check * * Return: true if the net device is a 802.11 wireless device, false otherwise. */ bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface) { if (!hard_iface) return false; return hard_iface->wifi_flags != 0; } /** * batadv_hardif_no_broadcast() - check whether (re)broadcast is necessary * @if_outgoing: the outgoing interface checked and considered for (re)broadcast * @orig_addr: the originator of this packet * @orig_neigh: originator address of the forwarder we just got the packet from * (NULL if we originated) * * Checks whether a packet needs to be (re)broadcasted on the given interface. * * Return: * BATADV_HARDIF_BCAST_NORECIPIENT: No neighbor on interface * BATADV_HARDIF_BCAST_DUPFWD: Just one neighbor, but it is the forwarder * BATADV_HARDIF_BCAST_DUPORIG: Just one neighbor, but it is the originator * BATADV_HARDIF_BCAST_OK: Several neighbors, must broadcast */ int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing, u8 *orig_addr, u8 *orig_neigh) { struct batadv_hardif_neigh_node *hardif_neigh; struct hlist_node *first; int ret = BATADV_HARDIF_BCAST_OK; rcu_read_lock(); /* 0 neighbors -> no (re)broadcast */ first = rcu_dereference(hlist_first_rcu(&if_outgoing->neigh_list)); if (!first) { ret = BATADV_HARDIF_BCAST_NORECIPIENT; goto out; } /* >1 neighbors -> (re)broadcast */ if (rcu_dereference(hlist_next_rcu(first))) goto out; hardif_neigh = hlist_entry(first, struct batadv_hardif_neigh_node, list); /* 1 neighbor, is the originator -> no rebroadcast */ if (orig_addr && batadv_compare_eth(hardif_neigh->orig, orig_addr)) { ret = BATADV_HARDIF_BCAST_DUPORIG; /* 1 neighbor, is the one we received from -> no rebroadcast */ } else if (orig_neigh && batadv_compare_eth(hardif_neigh->orig, orig_neigh)) { ret = BATADV_HARDIF_BCAST_DUPFWD; } out: rcu_read_unlock(); return ret; } static struct batadv_hard_iface * batadv_hardif_get_active(struct net_device *mesh_iface) { struct batadv_hard_iface *hard_iface; struct list_head *iter; rcu_read_lock(); netdev_for_each_lower_private_rcu(mesh_iface, hard_iface, iter) { if (hard_iface->if_status == BATADV_IF_ACTIVE && kref_get_unless_zero(&hard_iface->refcount)) goto out; } hard_iface = NULL; out: rcu_read_unlock(); return hard_iface; } static void batadv_primary_if_update_addr(struct batadv_priv *bat_priv, struct batadv_hard_iface *oldif) { struct batadv_hard_iface *primary_if; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; batadv_dat_init_own_addr(bat_priv, primary_if); batadv_bla_update_orig_address(bat_priv, primary_if, oldif); out: batadv_hardif_put(primary_if); } static void batadv_primary_if_select(struct batadv_priv *bat_priv, struct batadv_hard_iface *new_hard_iface) { struct batadv_hard_iface *curr_hard_iface; ASSERT_RTNL(); if (new_hard_iface) kref_get(&new_hard_iface->refcount); curr_hard_iface = rcu_replace_pointer(bat_priv->primary_if, new_hard_iface, 1); if (!new_hard_iface) goto out; bat_priv->algo_ops->iface.primary_set(new_hard_iface); batadv_primary_if_update_addr(bat_priv, curr_hard_iface); out: batadv_hardif_put(curr_hard_iface); } static bool batadv_hardif_is_iface_up(const struct batadv_hard_iface *hard_iface) 
{ if (hard_iface->net_dev->flags & IFF_UP) return true; return false; } static void batadv_check_known_mac_addr(const struct batadv_hard_iface *hard_iface) { struct net_device *mesh_iface = hard_iface->mesh_iface; const struct batadv_hard_iface *tmp_hard_iface; struct list_head *iter; if (!mesh_iface) return; netdev_for_each_lower_private(mesh_iface, tmp_hard_iface, iter) { if (tmp_hard_iface == hard_iface) continue; if (tmp_hard_iface->if_status == BATADV_IF_NOT_IN_USE) continue; if (!batadv_compare_eth(tmp_hard_iface->net_dev->dev_addr, hard_iface->net_dev->dev_addr)) continue; pr_warn("The newly added mac address (%pM) already exists on: %s\n", hard_iface->net_dev->dev_addr, tmp_hard_iface->net_dev->name); pr_warn("It is strongly recommended to keep mac addresses unique to avoid problems!\n"); } } /** * batadv_hardif_recalc_extra_skbroom() - Recalculate skbuff extra head/tailroom * @mesh_iface: netdev struct of the mesh interface */ static void batadv_hardif_recalc_extra_skbroom(struct net_device *mesh_iface) { const struct batadv_hard_iface *hard_iface; u