64 26 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_RTNH_H #define __NET_RTNH_H #include <linux/rtnetlink.h> #include <net/netlink.h> static inline int rtnh_ok(const struct rtnexthop *rtnh, int remaining) { return remaining >= (int)sizeof(*rtnh) && rtnh->rtnh_len >= sizeof(*rtnh) && rtnh->rtnh_len <= remaining; } static inline struct rtnexthop *rtnh_next(const struct rtnexthop *rtnh, int *remaining) { int totlen = NLA_ALIGN(rtnh->rtnh_len); *remaining -= totlen; return (struct rtnexthop *) ((char *) rtnh + totlen); } static inline struct nlattr *rtnh_attrs(const struct rtnexthop *rtnh) { return (struct nlattr *) ((char *) rtnh + NLA_ALIGN(sizeof(*rtnh))); } static inline int rtnh_attrlen(const struct rtnexthop *rtnh) { return rtnh->rtnh_len - NLA_ALIGN(sizeof(*rtnh)); } #endif |
1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 | // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" #include "backpointers.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_key_cache.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_gc.h" #include "btree_write_buffer.h" #include "buckets.h" #include "buckets_waiting_for_journal.h" #include "clock.h" #include "debug.h" #include "ec.h" #include "error.h" #include "lru.h" #include "recovery.h" #include "trace.h" #include "varint.h" #include <linux/kthread.h> #include <linux/math64.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/sched/task.h> #include <linux/sort.h> static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket); /* Persistent alloc info: */ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, BCH_ALLOC_FIELDS_V1() #undef x }; struct bkey_alloc_unpacked { u64 journal_seq; u8 gen; u8 oldest_gen; u8 data_type; bool need_discard:1; bool need_inc_gen:1; #define x(_name, _bits) u##_bits _name; BCH_ALLOC_FIELDS_V2() #undef x }; static inline u64 alloc_field_v1_get(const struct bch_alloc *a, const void **p, unsigned field) { unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; u64 v; if (!(a->fields & (1 << field))) return 0; switch (bytes) { case 1: v = *((const u8 *) *p); break; case 2: v = le16_to_cpup(*p); break; case 4: v = le32_to_cpup(*p); break; case 8: v = le64_to_cpup(*p); break; default: BUG(); } *p += bytes; return v; } static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, struct bkey_s_c k) { const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; const void *d = in->data; unsigned idx = 0; out->gen = in->gen; #define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); BCH_ALLOC_FIELDS_V1() #undef x } static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, struct bkey_s_c k) { struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); const u8 *in = a.v->data; const u8 *end = bkey_val_end(a); unsigned fieldnr = 0; int ret; u64 v; out->gen = a.v->gen; out->oldest_gen = a.v->oldest_gen; out->data_type = a.v->data_type; #define x(_name, _bits) \ if (fieldnr < a.v->nr_fields) { \ ret = bch2_varint_decode_fast(in, end, &v); \ if (ret < 0) \ return ret; \ in += ret; \ } else { \ v = 0; \ } \ out->_name = v; \ if (v != out->_name) \ return -1; \ fieldnr++; BCH_ALLOC_FIELDS_V2() #undef x return 0; } static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, struct bkey_s_c k) { struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k); const u8 *in = a.v->data; const u8 *end = bkey_val_end(a); unsigned fieldnr = 0; int ret; u64 v; out->gen = a.v->gen; out->oldest_gen = a.v->oldest_gen; out->data_type = a.v->data_type; out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v); out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v); out->journal_seq = le64_to_cpu(a.v->journal_seq); #define x(_name, _bits) \ if (fieldnr < a.v->nr_fields) { \ ret = bch2_varint_decode_fast(in, end, &v); \ if (ret < 0) \ return ret; \ in += ret; \ } else { \ v = 0; \ } \ out->_name = v; \ if (v != out->_name) \ return -1; \ fieldnr++; BCH_ALLOC_FIELDS_V2() #undef x return 0; } static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) { struct bkey_alloc_unpacked ret = { .gen = 0 }; switch (k.k->type) { case KEY_TYPE_alloc: bch2_alloc_unpack_v1(&ret, k); break; case KEY_TYPE_alloc_v2: bch2_alloc_unpack_v2(&ret, k); break; case KEY_TYPE_alloc_v3: bch2_alloc_unpack_v3(&ret, k); break; } return ret; } static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) { unsigned i, bytes = offsetof(struct bch_alloc, data); for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) if (a->fields & (1 << i)) bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; return DIV_ROUND_UP(bytes, sizeof(u64)); } int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); int ret = 0; /* allow for unknown fields */ bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err, alloc_v1_val_size_bad, "incorrect value size (%zu < %u)", bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); fsck_err: return ret; } int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_alloc_unpacked u; int ret = 0; bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err, alloc_v2_unpack_error, "unpack error"); fsck_err: return ret; } int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_alloc_unpacked u; int ret = 0; bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err, alloc_v2_unpack_error, "unpack error"); fsck_err: return ret; } int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); int ret = 0; bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), c, err, alloc_v4_val_size_bad, "bad val size (%u > %zu)", alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k)); bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err, alloc_v4_backpointers_start_bad, "invalid backpointers_start"); bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err, alloc_key_data_type_bad, "invalid data type (got %u should be %u)", a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); switch (a.v->data_type) { case BCH_DATA_free: case BCH_DATA_need_gc_gens: case BCH_DATA_need_discard: bkey_fsck_err_on(bch2_bucket_sectors(*a.v) || a.v->stripe, c, err, alloc_key_empty_but_have_data, "empty data type free but have data"); break; case BCH_DATA_sb: case BCH_DATA_journal: case BCH_DATA_btree: case BCH_DATA_user: case BCH_DATA_parity: bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v), c, err, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", bch2_data_type_str(a.v->data_type)); break; case BCH_DATA_cached: bkey_fsck_err_on(!a.v->cached_sectors || bch2_bucket_sectors_dirty(*a.v) || a.v->stripe, c, err, alloc_key_cached_inconsistency, "data type inconsistency"); bkey_fsck_err_on(!a.v->io_time[READ] && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, c, err, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); break; case BCH_DATA_stripe: break; } fsck_err: return ret; } void bch2_alloc_v4_swab(struct bkey_s k) { struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; struct bch_backpointer *bp, *bps; a->journal_seq = swab64(a->journal_seq); a->flags = swab32(a->flags); a->dirty_sectors = swab32(a->dirty_sectors); a->cached_sectors = swab32(a->cached_sectors); a->io_time[0] = swab64(a->io_time[0]); a->io_time[1] = swab64(a->io_time[1]); a->stripe = swab32(a->stripe); a->nr_external_backpointers = swab32(a->nr_external_backpointers); a->fragmentation_lru = swab64(a->fragmentation_lru); bps = alloc_v4_backpointers(a); for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { bp->bucket_offset = swab40(bp->bucket_offset); bp->bucket_len = swab32(bp->bucket_len); bch2_bpos_swab(&bp->pos); } } void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bch_alloc_v4 _a; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); prt_newline(out); printbuf_indent_add(out, 2); prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); bch2_prt_data_type(out, a->data_type); prt_newline(out); prt_printf(out, "journal_seq %llu", a->journal_seq); prt_newline(out); prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a)); prt_newline(out); prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a)); prt_newline(out); prt_printf(out, "dirty_sectors %u", a->dirty_sectors); prt_newline(out); prt_printf(out, "cached_sectors %u", a->cached_sectors); prt_newline(out); prt_printf(out, "stripe %u", a->stripe); prt_newline(out); prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy); prt_newline(out); prt_printf(out, "io_time[READ] %llu", a->io_time[READ]); prt_newline(out); prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]); prt_newline(out); prt_printf(out, "fragmentation %llu", a->fragmentation_lru); prt_newline(out); prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); printbuf_indent_sub(out, 2); } void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) { if (k.k->type == KEY_TYPE_alloc_v4) { void *src, *dst; *out = *bkey_s_c_to_alloc_v4(k).v; src = alloc_v4_backpointers(out); SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); dst = alloc_v4_backpointers(out); if (src < dst) memset(src, 0, dst - src); SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0); } else { struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); *out = (struct bch_alloc_v4) { .journal_seq = u.journal_seq, .flags = u.need_discard, .gen = u.gen, .oldest_gen = u.oldest_gen, .data_type = u.data_type, .stripe_redundancy = u.stripe_redundancy, .dirty_sectors = u.dirty_sectors, .cached_sectors = u.cached_sectors, .io_time[READ] = u.read_time, .io_time[WRITE] = u.write_time, .stripe = u.stripe, }; SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); } } static noinline struct bkey_i_alloc_v4 * __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_i_alloc_v4 *ret; ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4))); if (IS_ERR(ret)) return ret; if (k.k->type == KEY_TYPE_alloc_v4) { void *src, *dst; bkey_reassemble(&ret->k_i, k); src = alloc_v4_backpointers(&ret->v); SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); dst = alloc_v4_backpointers(&ret->v); if (src < dst) memset(src, 0, dst - src); SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0); set_alloc_v4_u64s(ret); } else { bkey_alloc_v4_init(&ret->k_i); ret->k.p = k.k->p; bch2_alloc_to_v4(k, &ret->v); } return ret; } static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_s_c_alloc_v4 a; if (likely(k.k->type == KEY_TYPE_alloc_v4) && ((a = bkey_s_c_to_alloc_v4(k), true) && BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4); return __bch2_alloc_to_v4_mut(trans, k); } struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) { return bch2_alloc_to_v4_mut_inlined(trans, k); } struct bkey_i_alloc_v4 * bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, struct bpos pos) { struct bkey_s_c k; struct bkey_i_alloc_v4 *a; int ret; k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, BTREE_ITER_WITH_UPDATES| BTREE_ITER_CACHED| BTREE_ITER_INTENT); ret = bkey_err(k); if (unlikely(ret)) return ERR_PTR(ret); a = bch2_alloc_to_v4_mut_inlined(trans, k); ret = PTR_ERR_OR_ZERO(a); if (unlikely(ret)) goto err; return a; err: bch2_trans_iter_exit(trans, iter); return ERR_PTR(ret); } static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) { *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS; return pos; } static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset) { pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS; pos.offset += offset; return pos; } static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) { return k.k->type == KEY_TYPE_bucket_gens ? bkey_s_c_to_bucket_gens(k).v->gens[offset] : 0; } int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { int ret = 0; bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err, bucket_gens_val_size_bad, "bad val size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); fsck_err: return ret; } void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k); unsigned i; for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) { if (i) prt_char(out, ' '); prt_printf(out, "%u", g.v->gens[i]); } } int bch2_bucket_gens_init(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); struct bkey_i_bucket_gens g; bool have_bucket_gens_key = false; int ret; ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ({ /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: */ if (!bch2_dev_bucket_exists(c, k.k->p)) continue; struct bch_alloc_v4 a; u8 gen = bch2_alloc_to_v4(k, &a)->gen; unsigned offset; struct bpos pos = alloc_gens_pos(iter.pos, &offset); int ret2 = 0; if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret2) goto iter_err; have_bucket_gens_key = false; } if (!have_bucket_gens_key) { bkey_bucket_gens_init(&g.k_i); g.k.p = pos; have_bucket_gens_key = true; } g.v.gens[offset] = gen; iter_err: ret2; })); if (have_bucket_gens_key && !ret) ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } int bch2_alloc_read(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); int ret; down_read(&c->gc_lock); if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_PREFETCH, k, ({ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; if (k.k->type != KEY_TYPE_bucket_gens) continue; const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v; /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: */ if (!bch2_dev_exists2(c, k.k->p.inode)) continue; struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); for (u64 b = max_t(u64, ca->mi.first_bucket, start); b < min_t(u64, ca->mi.nbuckets, end); b++) *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; 0; })); } else { ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ({ /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: */ if (!bch2_dev_bucket_exists(c, k.k->p)) continue; struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); struct bch_alloc_v4 a; *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; 0; })); } bch2_trans_put(trans); up_read(&c->gc_lock); bch_err_fn(c, ret); return ret; } /* Free space/discard btree: */ static int bch2_bucket_do_index(struct btree_trans *trans, struct bkey_s_c alloc_k, const struct bch_alloc_v4 *a, bool set) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); struct btree_iter iter; struct bkey_s_c old; struct bkey_i *k; enum btree_id btree; enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted; struct printbuf buf = PRINTBUF; int ret; if (a->data_type != BCH_DATA_free && a->data_type != BCH_DATA_need_discard) return 0; k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); if (IS_ERR(k)) return PTR_ERR(k); bkey_init(&k->k); k->k.type = new_type; switch (a->data_type) { case BCH_DATA_free: btree = BTREE_ID_freespace; k->k.p = alloc_freespace_pos(alloc_k.k->p, *a); bch2_key_resize(&k->k, 1); break; case BCH_DATA_need_discard: btree = BTREE_ID_need_discard; k->k.p = alloc_k.k->p; break; default: return 0; } old = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(&k->k), BTREE_ITER_INTENT); ret = bkey_err(old); if (ret) return ret; if (ca->mi.freespace_initialized && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info && bch2_trans_inconsistent_on(old.k->type != old_type, trans, "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" " for %s", set ? "setting" : "clearing", bch2_btree_id_str(btree), iter.pos.inode, iter.pos.offset, bch2_bkey_types[old.k->type], bch2_bkey_types[old_type], (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ret = -EIO; goto err; } ret = bch2_trans_update(trans, &iter, k, 0); err: bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ret; } static noinline int bch2_bucket_gen_update(struct btree_trans *trans, struct bpos bucket, u8 gen) { struct btree_iter iter; unsigned offset; struct bpos pos = alloc_gens_pos(bucket, &offset); struct bkey_i_bucket_gens *g; struct bkey_s_c k; int ret; g = bch2_trans_kmalloc(trans, sizeof(*g)); ret = PTR_ERR_OR_ZERO(g); if (ret) return ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, BTREE_ITER_INTENT| BTREE_ITER_WITH_UPDATES); ret = bkey_err(k); if (ret) return ret; if (k.k->type != KEY_TYPE_bucket_gens) { bkey_bucket_gens_init(&g->k_i); g->k.p = iter.pos; } else { bkey_reassemble(&g->k_i, k); } g->v.gens[offset] = gen; ret = bch2_trans_update(trans, &iter, &g->k_i, 0); bch2_trans_iter_exit(trans, &iter); return ret; } int bch2_trigger_alloc(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, unsigned flags) { struct bch_fs *c = trans->c; int ret = 0; if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, "alloc key for invalid device or bucket")) return -EIO; struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode); struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); if (flags & BTREE_TRIGGER_TRANSACTIONAL) { struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; new_a->data_type = alloc_data_type(*new_a, new_a->data_type); if (bch2_bucket_sectors(*new_a) > bch2_bucket_sectors(*old_a)) { new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); } if (data_type_is_empty(new_a->data_type) && BCH_ALLOC_V4_NEED_INC_GEN(new_a) && !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { new_a->gen++; SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); } if (old_a->data_type != new_a->data_type || (new_a->data_type == BCH_DATA_free && alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { ret = bch2_bucket_do_index(trans, old, old_a, false) ?: bch2_bucket_do_index(trans, new.s_c, new_a, true); if (ret) return ret; } if (new_a->data_type == BCH_DATA_cached && !new_a->io_time[READ]) new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); u64 old_lru = alloc_lru_idx_read(*old_a); u64 new_lru = alloc_lru_idx_read(*new_a); if (old_lru != new_lru) { ret = bch2_lru_change(trans, new.k->p.inode, bucket_to_u64(new.k->p), old_lru, new_lru); if (ret) return ret; } new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, bch_dev_bkey_exists(c, new.k->p.inode)); if (old_a->fragmentation_lru != new_a->fragmentation_lru) { ret = bch2_lru_change(trans, BCH_LRU_FRAGMENTATION_START, bucket_to_u64(new.k->p), old_a->fragmentation_lru, new_a->fragmentation_lru); if (ret) return ret; } if (old_a->gen != new_a->gen) { ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); if (ret) return ret; } /* * need to know if we're getting called from the invalidate path or * not: */ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && old_a->cached_sectors) { ret = bch2_update_cached_sectors_list(trans, new.k->p.inode, -((s64) old_a->cached_sectors)); if (ret) return ret; } } if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; u64 journal_seq = trans->journal_res.seq; u64 bucket_journal_seq = new_a->journal_seq; if ((flags & BTREE_TRIGGER_INSERT) && data_type_is_empty(old_a->data_type) != data_type_is_empty(new_a->data_type) && new.k->type == KEY_TYPE_alloc_v4) { struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v; /* * If the btree updates referring to a bucket weren't flushed * before the bucket became empty again, then the we don't have * to wait on a journal flush before we can reuse the bucket: */ v->journal_seq = bucket_journal_seq = data_type_is_empty(new_a->data_type) && (journal_seq == v->journal_seq || bch2_journal_noflush_seq(&c->journal, v->journal_seq)) ? 0 : journal_seq; } if (!data_type_is_empty(old_a->data_type) && data_type_is_empty(new_a->data_type) && bucket_journal_seq) { ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, c->journal.flushed_seq_ondisk, new.k->p.inode, new.k->p.offset, bucket_journal_seq); if (ret) { bch2_fs_fatal_error(c, "setting bucket_needs_journal_commit: %s", bch2_err_str(ret)); return ret; } } percpu_down_read(&c->mark_lock); if (new_a->gen != old_a->gen) *bucket_gen(ca, new.k->p.offset) = new_a->gen; bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false); percpu_up_read(&c->mark_lock); #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) #define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) #define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) if (statechange(a->data_type == BCH_DATA_free) && bucket_flushed(new_a)) closure_wake_up(&c->freelist_wait); if (statechange(a->data_type == BCH_DATA_need_discard) && !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && bucket_flushed(new_a)) bch2_discard_one_bucket_fast(c, new.k->p); if (statechange(a->data_type == BCH_DATA_cached) && !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) bch2_do_invalidates(c); if (statechange(a->data_type == BCH_DATA_need_gc_gens)) bch2_do_gc_gens(c); } if ((flags & BTREE_TRIGGER_GC) && (flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) { struct bch_alloc_v4 new_a_convert; const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert); percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, new.k->p.offset); bucket_lock(g); g->gen_valid = 1; g->gen = new_a->gen; g->data_type = new_a->data_type; g->stripe = new_a->stripe; g->stripe_redundancy = new_a->stripe_redundancy; g->dirty_sectors = new_a->dirty_sectors; g->cached_sectors = new_a->cached_sectors; bucket_unlock(g); percpu_up_read(&c->mark_lock); } return 0; } /* * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for * extents style btrees, but works on non-extents btrees: */ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) { struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); if (bkey_err(k)) return k; if (k.k->type) { return k; } else { struct btree_iter iter2; struct bpos next; bch2_trans_copy_iter(&iter2, iter); struct btree_path *path = btree_iter_path(iter->trans, iter); if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX)) end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p)); end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)); /* * btree node min/max is a closed interval, upto takes a half * open interval: */ k = bch2_btree_iter_peek_upto(&iter2, end); next = iter2.pos; bch2_trans_iter_exit(iter->trans, &iter2); BUG_ON(next.offset >= iter->pos.offset + U32_MAX); if (bkey_err(k)) return k; bkey_init(hole); hole->p = iter->pos; bch2_key_resize(hole, next.offset - iter->pos.offset); return (struct bkey_s_c) { hole, NULL }; } } static bool next_bucket(struct bch_fs *c, struct bpos *bucket) { struct bch_dev *ca; if (bch2_dev_bucket_exists(c, *bucket)) return true; if (bch2_dev_exists2(c, bucket->inode)) { ca = bch_dev_bkey_exists(c, bucket->inode); if (bucket->offset < ca->mi.first_bucket) { bucket->offset = ca->mi.first_bucket; return true; } bucket->inode++; bucket->offset = 0; } rcu_read_lock(); ca = __bch2_next_dev_idx(c, bucket->inode, NULL); if (ca) *bucket = POS(ca->dev_idx, ca->mi.first_bucket); rcu_read_unlock(); return ca != NULL; } static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole) { struct bch_fs *c = iter->trans->c; struct bkey_s_c k; again: k = bch2_get_key_or_hole(iter, POS_MAX, hole); if (bkey_err(k)) return k; if (!k.k->type) { struct bpos bucket = bkey_start_pos(k.k); if (!bch2_dev_bucket_exists(c, bucket)) { if (!next_bucket(c, &bucket)) return bkey_s_c_null; bch2_btree_iter_set_pos(iter, bucket); goto again; } if (!bch2_dev_bucket_exists(c, k.k->p)) { struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset); } } return k; } static noinline_for_stack int bch2_check_alloc_key(struct btree_trans *trans, struct bkey_s_c alloc_k, struct btree_iter *alloc_iter, struct btree_iter *discard_iter, struct btree_iter *freespace_iter, struct btree_iter *bucket_gens_iter) { struct bch_fs *c = trans->c; struct bch_dev *ca; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; unsigned discard_key_type, freespace_key_type; unsigned gens_offset; struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, alloc_key_to_missing_dev_bucket, "alloc key for invalid device:bucket %llu:%llu", alloc_k.k->p.inode, alloc_k.k->p.offset)) return bch2_btree_delete_at(trans, alloc_iter, 0); ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); if (!ca->mi.freespace_initialized) return 0; a = bch2_alloc_to_v4(alloc_k, &a_convert); discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0; bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); k = bch2_btree_iter_peek_slot(discard_iter); ret = bkey_err(k); if (ret) goto err; if (fsck_err_on(k.k->type != discard_key_type, c, need_discard_key_wrong, "incorrect key in need_discard btree (got %s should be %s)\n" " %s", bch2_bkey_types[k.k->type], bch2_bkey_types[discard_key_type], (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); ret = PTR_ERR_OR_ZERO(update); if (ret) goto err; bkey_init(&update->k); update->k.type = discard_key_type; update->k.p = discard_iter->pos; ret = bch2_trans_update(trans, discard_iter, update, 0); if (ret) goto err; } freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0; bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); k = bch2_btree_iter_peek_slot(freespace_iter); ret = bkey_err(k); if (ret) goto err; if (fsck_err_on(k.k->type != freespace_key_type, c, freespace_key_wrong, "incorrect key in freespace btree (got %s should be %s)\n" " %s", bch2_bkey_types[k.k->type], bch2_bkey_types[freespace_key_type], (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); ret = PTR_ERR_OR_ZERO(update); if (ret) goto err; bkey_init(&update->k); update->k.type = freespace_key_type; update->k.p = freespace_iter->pos; bch2_key_resize(&update->k, 1); ret = bch2_trans_update(trans, freespace_iter, update, 0); if (ret) goto err; } bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); k = bch2_btree_iter_peek_slot(bucket_gens_iter); ret = bkey_err(k); if (ret) goto err; if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), c, bucket_gens_key_wrong, "incorrect gen in bucket_gens btree (got %u should be %u)\n" " %s", alloc_gen(k, gens_offset), a->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i_bucket_gens *g = bch2_trans_kmalloc(trans, sizeof(*g)); ret = PTR_ERR_OR_ZERO(g); if (ret) goto err; if (k.k->type == KEY_TYPE_bucket_gens) { bkey_reassemble(&g->k_i, k); } else { bkey_bucket_gens_init(&g->k_i); g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset); } g->v.gens[gens_offset] = a->gen; ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0); if (ret) goto err; } err: fsck_err: printbuf_exit(&buf); return ret; } static noinline_for_stack int bch2_check_alloc_hole_freespace(struct btree_trans *trans, struct bpos start, struct bpos *end, struct btree_iter *freespace_iter) { struct bch_fs *c = trans->c; struct bch_dev *ca; struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; ca = bch_dev_bkey_exists(c, start.inode); if (!ca->mi.freespace_initialized) return 0; bch2_btree_iter_set_pos(freespace_iter, start); k = bch2_btree_iter_peek_slot(freespace_iter); ret = bkey_err(k); if (ret) goto err; *end = bkey_min(k.k->p, *end); if (fsck_err_on(k.k->type != KEY_TYPE_set, c, freespace_hole_missing, "hole in alloc btree missing in freespace btree\n" " device %llu buckets %llu-%llu", freespace_iter->pos.inode, freespace_iter->pos.offset, end->offset)) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); ret = PTR_ERR_OR_ZERO(update); if (ret) goto err; bkey_init(&update->k); update->k.type = KEY_TYPE_set; update->k.p = freespace_iter->pos; bch2_key_resize(&update->k, min_t(u64, U32_MAX, end->offset - freespace_iter->pos.offset)); ret = bch2_trans_update(trans, freespace_iter, update, 0); if (ret) goto err; } err: fsck_err: printbuf_exit(&buf); return ret; } static noinline_for_stack int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, struct bpos start, struct bpos *end, struct btree_iter *bucket_gens_iter) { struct bch_fs *c = trans->c; struct bkey_s_c k; struct printbuf buf = PRINTBUF; unsigned i, gens_offset, gens_end_offset; int ret; bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); k = bch2_btree_iter_peek_slot(bucket_gens_iter); ret = bkey_err(k); if (ret) goto err; if (bkey_cmp(alloc_gens_pos(start, &gens_offset), alloc_gens_pos(*end, &gens_end_offset))) gens_end_offset = KEY_TYPE_BUCKET_GENS_NR; if (k.k->type == KEY_TYPE_bucket_gens) { struct bkey_i_bucket_gens g; bool need_update = false; bkey_reassemble(&g.k_i, k); for (i = gens_offset; i < gens_end_offset; i++) { if (fsck_err_on(g.v.gens[i], c, bucket_gens_hole_wrong, "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", bucket_gens_pos_to_alloc(k.k->p, i).inode, bucket_gens_pos_to_alloc(k.k->p, i).offset, g.v.gens[i])) { g.v.gens[i] = 0; need_update = true; } } if (need_update) { struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); ret = PTR_ERR_OR_ZERO(u); if (ret) goto err; memcpy(u, &g, sizeof(g)); ret = bch2_trans_update(trans, bucket_gens_iter, u, 0); if (ret) goto err; } } *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0)); err: fsck_err: printbuf_exit(&buf); return ret; } static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter) { struct bch_fs *c = trans->c; struct btree_iter alloc_iter; struct bkey_s_c alloc_k; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; u64 genbits; struct bpos pos; enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard ? BCH_DATA_need_discard : BCH_DATA_free; struct printbuf buf = PRINTBUF; int ret; pos = iter->pos; pos.offset &= ~(~0ULL << 56); genbits = iter->pos.offset & (~0ULL << 56); alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); ret = bkey_err(alloc_k); if (ret) return ret; if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, need_discard_freespace_key_to_invalid_dev_bucket, "entry in %s btree for nonexistant dev:bucket %llu:%llu", bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset)) goto delete; a = bch2_alloc_to_v4(alloc_k, &a_convert); if (fsck_err_on(a->data_type != state || (state == BCH_DATA_free && genbits != alloc_freespace_genbits(*a)), c, need_discard_freespace_key_bad, "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), bch2_btree_id_str(iter->btree_id), iter->pos.inode, iter->pos.offset, a->data_type == state, genbits >> 56, alloc_freespace_genbits(*a) >> 56)) goto delete; out: fsck_err: set_btree_iter_dontneed(&alloc_iter); bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; delete: ret = bch2_btree_delete_extent_at(trans, iter, iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); goto out; } /* * We've already checked that generation numbers in the bucket_gens btree are * valid for buckets that exist; this just checks for keys for nonexistent * buckets. */ static noinline_for_stack int bch2_check_bucket_gens_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bkey_i_bucket_gens g; struct bch_dev *ca; u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; u64 b; bool need_update = false, dev_exists; struct printbuf buf = PRINTBUF; int ret = 0; BUG_ON(k.k->type != KEY_TYPE_bucket_gens); bkey_reassemble(&g.k_i, k); /* if no bch_dev, skip out whether we repair or not */ dev_exists = bch2_dev_exists2(c, k.k->p.inode); if (!dev_exists) { if (fsck_err_on(!dev_exists, c, bucket_gens_to_invalid_dev, "bucket_gens key for invalid device:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, iter, 0); } goto out; } ca = bch_dev_bkey_exists(c, k.k->p.inode); if (fsck_err_on(end <= ca->mi.first_bucket || start >= ca->mi.nbuckets, c, bucket_gens_to_invalid_buckets, "bucket_gens key for invalid buckets:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, iter, 0); goto out; } for (b = start; b < ca->mi.first_bucket; b++) if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, bucket_gens_nonzero_for_invalid_buckets, "bucket_gens key has nonzero gen for invalid bucket")) { g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; need_update = true; } for (b = ca->mi.nbuckets; b < end; b++) if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, bucket_gens_nonzero_for_invalid_buckets, "bucket_gens key has nonzero gen for invalid bucket")) { g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; need_update = true; } if (need_update) { struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); ret = PTR_ERR_OR_ZERO(u); if (ret) goto out; memcpy(u, &g, sizeof(g)); ret = bch2_trans_update(trans, iter, u, 0); } out: fsck_err: printbuf_exit(&buf); return ret; } int bch2_check_alloc_info(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; struct bkey hole; struct bkey_s_c k; int ret = 0; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH); bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_PREFETCH); bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH); bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_PREFETCH); while (1) { struct bpos next; bch2_trans_begin(trans); k = bch2_get_key_or_real_bucket_hole(&iter, &hole); ret = bkey_err(k); if (ret) goto bkey_err; if (!k.k) break; if (k.k->type) { next = bpos_nosnap_successor(k.k->p); ret = bch2_check_alloc_key(trans, k, &iter, &discard_iter, &freespace_iter, &bucket_gens_iter); if (ret) goto bkey_err; } else { next = k.k->p; ret = bch2_check_alloc_hole_freespace(trans, bkey_start_pos(k.k), &next, &freespace_iter) ?: bch2_check_alloc_hole_bucket_gens(trans, bkey_start_pos(k.k), &next, &bucket_gens_iter); if (ret) goto bkey_err; } ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) goto bkey_err; bch2_btree_iter_set_pos(&iter, next); bkey_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; } bch2_trans_iter_exit(trans, &bucket_gens_iter); bch2_trans_iter_exit(trans, &freespace_iter); bch2_trans_iter_exit(trans, &discard_iter); bch2_trans_iter_exit(trans, &iter); if (ret < 0) goto err; ret = for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_PREFETCH, k, bch2_check_discard_freespace_key(trans, &iter)); if (ret) goto err; bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH); while (1) { bch2_trans_begin(trans); k = bch2_btree_iter_peek(&iter); if (!k.k) break; ret = bkey_err(k) ?: bch2_check_discard_freespace_key(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; } if (ret) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); bch_err(c, "while checking %s", buf.buf); printbuf_exit(&buf); break; } bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); } bch2_trans_iter_exit(trans, &iter); if (ret) goto err; ret = for_each_btree_key_commit(trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_bucket_gens_key(trans, &iter, k)); err: bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, struct btree_iter *alloc_iter) { struct bch_fs *c = trans->c; struct btree_iter lru_iter; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; struct bkey_s_c alloc_k, lru_k; struct printbuf buf = PRINTBUF; int ret; alloc_k = bch2_btree_iter_peek(alloc_iter); if (!alloc_k.k) return 0; ret = bkey_err(alloc_k); if (ret) return ret; a = bch2_alloc_to_v4(alloc_k, &a_convert); if (a->data_type != BCH_DATA_cached) return 0; if (fsck_err_on(!a->io_time[READ], c, alloc_key_cached_but_read_time_zero, "cached bucket with read_time 0\n" " %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i_alloc_v4 *a_mut = bch2_alloc_to_v4_mut(trans, alloc_k); ret = PTR_ERR_OR_ZERO(a_mut); if (ret) goto err; a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); ret = bch2_trans_update(trans, alloc_iter, &a_mut->k_i, BTREE_TRIGGER_NORUN); if (ret) goto err; a = &a_mut->v; } lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, lru_pos(alloc_k.k->p.inode, bucket_to_u64(alloc_k.k->p), a->io_time[READ]), 0); ret = bkey_err(lru_k); if (ret) return ret; if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, alloc_key_to_missing_lru_entry, "missing lru entry\n" " %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ret = bch2_lru_set(trans, alloc_k.k->p.inode, bucket_to_u64(alloc_k.k->p), a->io_time[READ]); if (ret) goto err; } err: fsck_err: bch2_trans_iter_exit(trans, &lru_iter); printbuf_exit(&buf); return ret; } int bch2_check_alloc_to_lru_refs(struct bch_fs *c) { int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_alloc_to_lru_ref(trans, &iter))); bch_err_fn(c, ret); return ret; } static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket) { int ret; mutex_lock(&c->discard_buckets_in_flight_lock); darray_for_each(c->discard_buckets_in_flight, i) if (bkey_eq(*i, bucket)) { ret = -EEXIST; goto out; } ret = darray_push(&c->discard_buckets_in_flight, bucket); out: mutex_unlock(&c->discard_buckets_in_flight_lock); return ret; } static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket) { mutex_lock(&c->discard_buckets_in_flight_lock); darray_for_each(c->discard_buckets_in_flight, i) if (bkey_eq(*i, bucket)) { darray_remove_item(&c->discard_buckets_in_flight, i); goto found; } BUG(); found: mutex_unlock(&c->discard_buckets_in_flight_lock); } struct discard_buckets_state { u64 seen; u64 open; u64 need_journal_commit; u64 discarded; struct bch_dev *ca; u64 need_journal_commit_this_dev; }; static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca) { if (s->ca == ca) return; if (s->ca && s->need_journal_commit_this_dev > bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets) bch2_journal_flush_async(&c->journal, NULL); if (s->ca) percpu_ref_put(&s->ca->ref); if (ca) percpu_ref_get(&ca->ref); s->ca = ca; s->need_journal_commit_this_dev = 0; } static int bch2_discard_one_bucket(struct btree_trans *trans, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, struct discard_buckets_state *s) { struct bch_fs *c = trans->c; struct bpos pos = need_discard_iter->pos; struct btree_iter iter = { NULL }; struct bkey_s_c k; struct bch_dev *ca; struct bkey_i_alloc_v4 *a; struct printbuf buf = PRINTBUF; bool discard_locked = false; int ret = 0; ca = bch_dev_bkey_exists(c, pos.inode); if (!percpu_ref_tryget(&ca->io_ref)) { bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); return 0; } discard_buckets_next_dev(c, s, ca); if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { s->open++; goto out; } if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, c->journal.flushed_seq_ondisk, pos.inode, pos.offset)) { s->need_journal_commit++; s->need_journal_commit_this_dev++; goto out; } k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, need_discard_iter->pos, BTREE_ITER_CACHED); ret = bkey_err(k); if (ret) goto out; a = bch2_alloc_to_v4_mut(trans, k); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; if (a->v.dirty_sectors) { if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, trans, "attempting to discard bucket with dirty data\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = -EIO; goto out; } if (a->v.data_type != BCH_DATA_need_discard) { if (data_type_is_empty(a->v.data_type) && BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { a->v.gen++; SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); goto write; } if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, trans, "bucket incorrectly set in need_discard btree\n" "%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = -EIO; goto out; } if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s", a->v.journal_seq, c->journal.flushed_seq_ondisk, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = -EIO; goto out; } if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true))) goto out; discard_locked = true; if (!bkey_eq(*discard_pos_done, iter.pos) && ca->mi.discard && !c->opts.nochanges) { /* * This works without any other locks because this is the only * thread that removes items from the need_discard tree */ bch2_trans_unlock_long(trans); blkdev_issue_discard(ca->disk_sb.bdev, k.k->p.offset * ca->mi.bucket_size, ca->mi.bucket_size, GFP_KERNEL); *discard_pos_done = iter.pos; ret = bch2_trans_relock_notrace(trans); if (ret) goto out; } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); a->v.data_type = alloc_data_type(a->v, a->v.data_type); write: ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; count_event(c, bucket_discard); s->discarded++; out: if (discard_locked) discard_in_flight_remove(c, iter.pos); s->seen++; bch2_trans_iter_exit(trans, &iter); percpu_ref_put(&ca->io_ref); printbuf_exit(&buf); return ret; } static void bch2_do_discards_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, discard_work); struct discard_buckets_state s = {}; struct bpos discard_pos_done = POS_MAX; int ret; /* * We're doing the commit in bch2_discard_one_bucket instead of using * for_each_btree_key_commit() so that we can increment counters after * successful commit: */ ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, 0, k, bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s))); discard_buckets_next_dev(c, &s, NULL); trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); bch2_write_ref_put(c, BCH_WRITE_REF_discard); } void bch2_do_discards(struct bch_fs *c) { if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && !queue_work(c->write_ref_wq, &c->discard_work)) bch2_write_ref_put(c, BCH_WRITE_REF_discard); } static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) { struct btree_iter iter; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_INTENT); struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); int ret = bkey_err(k); if (ret) goto err; struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k); ret = PTR_ERR_OR_ZERO(a); if (ret) goto err; BUG_ON(a->v.dirty_sectors); SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); a->v.data_type = alloc_data_type(a->v, a->v.data_type); ret = bch2_trans_update(trans, &iter, &a->k_i, 0); err: bch2_trans_iter_exit(trans, &iter); return ret; } static void bch2_do_discards_fast_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work); while (1) { bool got_bucket = false; struct bpos bucket; struct bch_dev *ca; mutex_lock(&c->discard_buckets_in_flight_lock); darray_for_each(c->discard_buckets_in_flight, i) { if (i->snapshot) continue; ca = bch_dev_bkey_exists(c, i->inode); if (!percpu_ref_tryget(&ca->io_ref)) { darray_remove_item(&c->discard_buckets_in_flight, i); continue; } got_bucket = true; bucket = *i; i->snapshot = true; break; } mutex_unlock(&c->discard_buckets_in_flight_lock); if (!got_bucket) break; if (ca->mi.discard && !c->opts.nochanges) blkdev_issue_discard(ca->disk_sb.bdev, bucket.offset * ca->mi.bucket_size, ca->mi.bucket_size, GFP_KERNEL); int ret = bch2_trans_do(c, NULL, NULL, BCH_WATERMARK_btree| BCH_TRANS_COMMIT_no_enospc, bch2_clear_bucket_needs_discard(trans, bucket)); bch_err_fn(c, ret); percpu_ref_put(&ca->io_ref); discard_in_flight_remove(c, bucket); if (ret) break; } bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket) { struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); if (!percpu_ref_is_dying(&ca->io_ref) && !discard_in_flight_add(c, bucket) && bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) && !queue_work(c->write_ref_wq, &c->discard_fast_work)) bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } static int invalidate_one_bucket(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, s64 *nr_to_invalidate) { struct bch_fs *c = trans->c; struct btree_iter alloc_iter = { NULL }; struct bkey_i_alloc_v4 *a = NULL; struct printbuf buf = PRINTBUF; struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); unsigned cached_sectors; int ret = 0; if (*nr_to_invalidate <= 0) return 1; if (!bch2_dev_bucket_exists(c, bucket)) { prt_str(&buf, "lru entry points to invalid bucket"); goto err; } if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) return 0; a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; /* We expect harmless races here due to the btree write buffer: */ if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v)) goto out; BUG_ON(a->v.data_type != BCH_DATA_cached); BUG_ON(a->v.dirty_sectors); if (!a->v.cached_sectors) bch_err(c, "invalidating empty bucket, confused"); cached_sectors = a->v.cached_sectors; SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); a->v.gen++; a->v.data_type = 0; a->v.dirty_sectors = 0; a->v.cached_sectors = 0; a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, BTREE_TRIGGER_BUCKET_INVALIDATE) ?: bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); --*nr_to_invalidate; out: bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; err: prt_str(&buf, "\n lru key: "); bch2_bkey_val_to_text(&buf, c, lru_k); prt_str(&buf, "\n lru entry: "); bch2_lru_pos_to_text(&buf, lru_iter->pos); prt_str(&buf, "\n alloc key: "); if (!a) bch2_bpos_to_text(&buf, bucket); else bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); bch_err(c, "%s", buf.buf); if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) { bch2_inconsistent_error(c); ret = -EINVAL; } goto out; } static void bch2_do_invalidates_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); struct btree_trans *trans = bch2_trans_get(c); int ret = 0; ret = bch2_btree_write_buffer_tryflush(trans); if (ret) goto err; for_each_member_device(c, ca) { s64 nr_to_invalidate = should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, lru_pos(ca->dev_idx, 0, 0), lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), BTREE_ITER_INTENT, k, invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate)); if (ret < 0) { percpu_ref_put(&ca->ref); break; } } err: bch2_trans_put(trans); bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } void bch2_do_invalidates(struct bch_fs *c) { if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && !queue_work(c->write_ref_wq, &c->invalidate_work)) bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, u64 bucket_start, u64 bucket_end) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bkey hole; struct bpos end = POS(ca->dev_idx, bucket_end); struct bch_member *m; unsigned long last_updated = jiffies; int ret; BUG_ON(bucket_start > bucket_end); BUG_ON(bucket_end > ca->mi.nbuckets); bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)), BTREE_ITER_PREFETCH); /* * Scan the alloc btree for every bucket on @ca, and add buckets to the * freespace/need_discard/need_gc_gens btrees as needed: */ while (1) { if (last_updated + HZ * 10 < jiffies) { bch_info(ca, "%s: currently at %llu/%llu", __func__, iter.pos.offset, ca->mi.nbuckets); last_updated = jiffies; } bch2_trans_begin(trans); if (bkey_ge(iter.pos, end)) { ret = 0; break; } k = bch2_get_key_or_hole(&iter, end, &hole); ret = bkey_err(k); if (ret) goto bkey_err; if (k.k->type) { /* * We process live keys in the alloc btree one at a * time: */ struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); ret = bch2_bucket_do_index(trans, k, a, true) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) goto bkey_err; bch2_btree_iter_advance(&iter); } else { struct bkey_i *freespace; freespace = bch2_trans_kmalloc(trans, sizeof(*freespace)); ret = PTR_ERR_OR_ZERO(freespace); if (ret) goto bkey_err; bkey_init(&freespace->k); freespace->k.type = KEY_TYPE_set; freespace->k.p = k.k->p; freespace->k.size = k.k->size; ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) goto bkey_err; bch2_btree_iter_set_pos(&iter, k.k->p); } bkey_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; } bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); if (ret < 0) { bch_err_msg(ca, ret, "initializing free space"); return ret; } mutex_lock(&c->sb_lock); m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); mutex_unlock(&c->sb_lock); return 0; } int bch2_fs_freespace_init(struct bch_fs *c) { int ret = 0; bool doing_init = false; /* * We can crash during the device add path, so we need to check this on * every mount: */ for_each_member_device(c, ca) { if (ca->mi.freespace_initialized) continue; if (!doing_init) { bch_info(c, "initializing freespace"); doing_init = true; } ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); if (ret) { percpu_ref_put(&ca->ref); bch_err_fn(c, ret); return ret; } } if (doing_init) { mutex_lock(&c->sb_lock); bch2_write_super(c); mutex_unlock(&c->sb_lock); bch_verbose(c, "done initializing freespace"); } return 0; } /* Bucket IO clocks: */ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, size_t bucket_nr, int rw) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_i_alloc_v4 *a; u64 now; int ret = 0; a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr)); ret = PTR_ERR_OR_ZERO(a); if (ret) return ret; now = atomic64_read(&c->io_clock[rw].now); if (a->v.io_time[rw] == now) goto out; a->v.io_time[rw] = now; ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, 0); out: bch2_trans_iter_exit(trans, &iter); return ret; } /* Startup/shutdown (ro/rw): */ void bch2_recalc_capacity(struct bch_fs *c) { u64 capacity = 0, reserved_sectors = 0, gc_reserve; unsigned bucket_size_max = 0; unsigned long ra_pages = 0; lockdep_assert_held(&c->state_lock); for_each_online_member(c, ca) { struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; ra_pages += bdi->ra_pages; } bch2_set_ra_pages(c, ra_pages); for_each_rw_member(c, ca) { u64 dev_reserve = 0; /* * We need to reserve buckets (from the number * of currently available buckets) against * foreground writes so that mainly copygc can * make forward progress. * * We need enough to refill the various reserves * from scratch - copygc will use its entire * reserve all at once, then run against when * its reserve is refilled (from the formerly * available buckets). * * This reserve is just used when considering if * allocations for foreground writes must wait - * not -ENOSPC calculations. */ dev_reserve += ca->nr_btree_reserve * 2; dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ dev_reserve += 1; /* btree write point */ dev_reserve += 1; /* copygc write point */ dev_reserve += 1; /* rebalance write point */ dev_reserve *= ca->mi.bucket_size; capacity += bucket_to_sector(ca, ca->mi.nbuckets - ca->mi.first_bucket); reserved_sectors += dev_reserve * 2; bucket_size_max = max_t(unsigned, bucket_size_max, ca->mi.bucket_size); } gc_reserve = c->opts.gc_reserve_bytes ? c->opts.gc_reserve_bytes >> 9 : div64_u64(capacity * c->opts.gc_reserve_percent, 100); reserved_sectors = max(gc_reserve, reserved_sectors); reserved_sectors = min(reserved_sectors, capacity); c->capacity = capacity - reserved_sectors; c->bucket_size_max = bucket_size_max; /* Wake up case someone was waiting for buckets */ closure_wake_up(&c->freelist_wait); } u64 bch2_min_rw_member_capacity(struct bch_fs *c) { u64 ret = U64_MAX; for_each_rw_member(c, ca) ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); return ret; } static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) { struct open_bucket *ob; bool ret = false; for (ob = c->open_buckets; ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { spin_lock(&ob->lock); if (ob->valid && !ob->on_partial_list && ob->dev == ca->dev_idx) ret = true; spin_unlock(&ob->lock); } return ret; } /* device goes ro: */ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) { unsigned i; /* First, remove device from allocation groups: */ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) clear_bit(ca->dev_idx, c->rw_devs[i].d); /* * Capacity is calculated based off of devices in allocation groups: */ bch2_recalc_capacity(c); bch2_open_buckets_stop(c, ca, false); /* * Wake up threads that were blocked on allocation, so they can notice * the device can no longer be removed and the capacity has changed: */ closure_wake_up(&c->freelist_wait); /* * journal_res_get() can block waiting for free space in the journal - * it needs to notice there may not be devices to allocate from anymore: */ wake_up(&c->journal.wait); /* Now wait for any in flight writes: */ closure_wait_event(&c->open_buckets_wait, !bch2_dev_has_open_write_point(c, ca)); } /* device goes rw: */ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) { unsigned i; for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) if (ca->mi.data_allowed & (1 << i)) set_bit(ca->dev_idx, c->rw_devs[i].d); } void bch2_fs_allocator_background_exit(struct bch_fs *c) { darray_exit(&c->discard_buckets_in_flight); } void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); mutex_init(&c->discard_buckets_in_flight_lock); INIT_WORK(&c->discard_work, bch2_do_discards_work); INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work); INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); } |
17 17 51 51 5 67 7 4 33 31 1 454 452 451 58 51 43 452 455 30 7 16 7 23 4 4 9 5 106 106 106 33 30 41 1 44 44 43 42 34 1 1 1 1 32 30 2 32 4 17 9 4 6 8 12 1 30 9 79 64 53 23 17 173 3 1 1 110 9 9 59 24 43 120 5 124 2 26 2 4 148 148 80 60 19 64 9 148 149 37 106 133 126 84 13 3 5 87 149 6 6 26 27 12 1 11 11 10 19 2 3 2 2 12 1 12 4 9 6 6 8 5 5 3 2 1 2 2 1 1 1 2 1 301 296 4 2 1 6 66 56 9 15 1 14 1 12 12 9 2 9 9 2 4 12 12 11 12 9 12 6 6 4 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * RAW - implementation of IP "raw" sockets. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Fixes: * Alan Cox : verify_area() fixed up * Alan Cox : ICMP error handling * Alan Cox : EMSGSIZE if you send too big a packet * Alan Cox : Now uses generic datagrams and shared * skbuff library. No more peek crashes, * no more backlogs * Alan Cox : Checks sk->broadcast. * Alan Cox : Uses skb_free_datagram/skb_copy_datagram * Alan Cox : Raw passes ip options too * Alan Cox : Setsocketopt added * Alan Cox : Fixed error return for broadcasts * Alan Cox : Removed wake_up calls * Alan Cox : Use ttl/tos * Alan Cox : Cleaned up old debugging * Alan Cox : Use new kernel side addresses * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets. * Alan Cox : BSD style RAW socket demultiplexing. * Alan Cox : Beginnings of mrouted support. * Alan Cox : Added IP_HDRINCL option. * Alan Cox : Skip broadcast check if BSDism set. * David S. Miller : New socket lookup architecture. */ #include <linux/types.h> #include <linux/atomic.h> #include <asm/byteorder.h> #include <asm/current.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/sockios.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/mroute.h> #include <linux/netdevice.h> #include <linux/in_route.h> #include <linux/route.h> #include <linux/skbuff.h> #include <linux/igmp.h> #include <net/net_namespace.h> #include <net/dst.h> #include <net/sock.h> #include <linux/ip.h> #include <linux/net.h> #include <net/ip.h> #include <net/icmp.h> #include <net/udp.h> #include <net/raw.h> #include <net/snmp.h> #include <net/tcp_states.h> #include <net/inet_common.h> #include <net/checksum.h> #include <net/xfrm.h> #include <linux/rtnetlink.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/compat.h> #include <linux/uio.h> struct raw_frag_vec { struct msghdr *msg; union { struct icmphdr icmph; char c[1]; } hdr; int hlen; }; struct raw_hashinfo raw_v4_hashinfo; EXPORT_SYMBOL_GPL(raw_v4_hashinfo); int raw_hash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; struct hlist_head *hlist; hlist = &h->ht[raw_hashfunc(sock_net(sk), inet_sk(sk)->inet_num)]; spin_lock(&h->lock); sk_add_node_rcu(sk, hlist); sock_set_flag(sk, SOCK_RCU_FREE); spin_unlock(&h->lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } EXPORT_SYMBOL_GPL(raw_hash_sk); void raw_unhash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; spin_lock(&h->lock); if (sk_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(&h->lock); } EXPORT_SYMBOL_GPL(raw_unhash_sk); bool raw_v4_match(struct net *net, const struct sock *sk, unsigned short num, __be32 raddr, __be32 laddr, int dif, int sdif) { const struct inet_sock *inet = inet_sk(sk); if (net_eq(sock_net(sk), net) && inet->inet_num == num && !(inet->inet_daddr && inet->inet_daddr != raddr) && !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return true; return false; } EXPORT_SYMBOL_GPL(raw_v4_match); /* * 0 - deliver * 1 - block */ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) { struct icmphdr _hdr; const struct icmphdr *hdr; hdr = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_hdr), &_hdr); if (!hdr) return 1; if (hdr->type < 32) { __u32 data = raw_sk(sk)->filter.data; return ((1U << hdr->type) & data) != 0; } /* Do not block unknown ICMP types */ return 0; } /* IP input processing comes here for RAW socket delivery. * Caller owns SKB, so we must make clones. * * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. */ static int raw_v4_input(struct net *net, struct sk_buff *skb, const struct iphdr *iph, int hash) { int sdif = inet_sdif(skb); struct hlist_head *hlist; int dif = inet_iif(skb); int delivered = 0; struct sock *sk; hlist = &raw_v4_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { if (!raw_v4_match(net, sk, iph->protocol, iph->saddr, iph->daddr, dif, sdif)) continue; if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); continue; } delivered = 1; if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) && ip_mc_sf_allow(sk, iph->daddr, iph->saddr, skb->dev->ifindex, sdif)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ if (clone) raw_rcv(sk, clone); } } rcu_read_unlock(); return delivered; } int raw_local_deliver(struct sk_buff *skb, int protocol) { struct net *net = dev_net(skb->dev); return raw_v4_input(net, skb, ip_hdr(skb), raw_hashfunc(net, protocol)); } static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) { struct inet_sock *inet = inet_sk(sk); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; int harderr = 0; bool recverr; int err = 0; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) ipv4_sk_update_pmtu(skb, sk, info); else if (type == ICMP_REDIRECT) { ipv4_sk_redirect(skb, sk); return; } /* Report error on raw socket, if: 1. User requested ip_recverr. 2. Socket is connected (otherwise the error indication is useless without ip_recverr and error is hard. */ recverr = inet_test_bit(RECVERR, sk); if (!recverr && sk->sk_state != TCP_ESTABLISHED) return; switch (type) { default: case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; case ICMP_SOURCE_QUENCH: return; case ICMP_PARAMETERPROB: err = EPROTO; harderr = 1; break; case ICMP_DEST_UNREACH: err = EHOSTUNREACH; if (code > NR_ICMP_UNREACH) break; if (code == ICMP_FRAG_NEEDED) { harderr = READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT; err = EMSGSIZE; } else { err = icmp_err_convert[code].errno; harderr = icmp_err_convert[code].fatal; } } if (recverr) { const struct iphdr *iph = (const struct iphdr *)skb->data; u8 *payload = skb->data + (iph->ihl << 2); if (inet_test_bit(HDRINCL, sk)) payload = skb->data; ip_icmp_error(sk, skb, err, 0, info, payload); } if (recverr || harderr) { sk->sk_err = err; sk_error_report(sk); } } void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) { struct net *net = dev_net(skb->dev); int dif = skb->dev->ifindex; int sdif = inet_sdif(skb); struct hlist_head *hlist; const struct iphdr *iph; struct sock *sk; int hash; hash = raw_hashfunc(net, protocol); hlist = &raw_v4_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { iph = (const struct iphdr *)skb->data; if (!raw_v4_match(net, sk, iph->protocol, iph->daddr, iph->saddr, dif, sdif)) continue; raw_err(sk, skb, info); } rcu_read_unlock(); } static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; /* Charge it to the socket. */ ipv4_pktinfo_prepare(sk, skb, true); if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { kfree_skb_reason(skb, reason); return NET_RX_DROP; } return NET_RX_SUCCESS; } int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { atomic_inc(&sk->sk_drops); kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } nf_reset_ct(skb); skb_push(skb, -skb_network_offset(skb)); raw_rcv_skb(sk, skb); return 0; } static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, struct msghdr *msg, size_t length, struct rtable **rtp, unsigned int flags, const struct sockcm_cookie *sockc) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct iphdr *iph; struct sk_buff *skb; unsigned int iphlen; int err; struct rtable *rt = *rtp; int hlen, tlen; if (length > rt->dst.dev->mtu) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, rt->dst.dev->mtu); return -EMSGSIZE; } if (length < sizeof(struct iphdr)) return -EINVAL; if (flags&MSG_PROBE) goto out; hlen = LL_RESERVED_SPACE(rt->dst.dev); tlen = rt->dst.dev->needed_tailroom; skb = sock_alloc_send_skb(sk, length + hlen + tlen + 15, flags & MSG_DONTWAIT, &err); if (!skb) goto error; skb_reserve(skb, hlen); skb->protocol = htons(ETH_P_IP); skb->priority = READ_ONCE(sk->sk_priority); skb->mark = sockc->mark; skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *rtp = NULL; skb_reset_network_header(skb); iph = ip_hdr(skb); skb_put(skb, length); skb->ip_summed = CHECKSUM_NONE; skb_setup_tx_timestamp(skb, sockc->tsflags); if (flags & MSG_CONFIRM) skb_set_dst_pending_confirm(skb, 1); skb->transport_header = skb->network_header; err = -EFAULT; if (memcpy_from_msg(iph, msg, length)) goto error_free; iphlen = iph->ihl * 4; /* * We don't want to modify the ip header, but we do need to * be sure that it won't cause problems later along the network * stack. Specifically we want to make sure that iph->ihl is a * sane value. If ihl points beyond the length of the buffer passed * in, reject the frame as invalid */ err = -EINVAL; if (iphlen > length) goto error_free; if (iphlen >= sizeof(*iph)) { if (!iph->saddr) iph->saddr = fl4->saddr; iph->check = 0; iph->tot_len = htons(length); if (!iph->id) ip_select_ident(net, skb, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); skb->transport_header += iphlen; if (iph->protocol == IPPROTO_ICMP && length >= iphlen + sizeof(struct icmphdr)) icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); } err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); if (err) goto error; out: return 0; error_free: kfree_skb(skb); error: IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) err = 0; return err; } static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4) { int err; if (fl4->flowi4_proto != IPPROTO_ICMP) return 0; /* We only need the first two bytes. */ rfv->hlen = 2; err = memcpy_from_msg(rfv->hdr.c, rfv->msg, rfv->hlen); if (err) return err; fl4->fl4_icmp_type = rfv->hdr.icmph.type; fl4->fl4_icmp_code = rfv->hdr.icmph.code; return 0; } static int raw_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct raw_frag_vec *rfv = from; if (offset < rfv->hlen) { int copy = min(rfv->hlen - offset, len); if (skb->ip_summed == CHECKSUM_PARTIAL) memcpy(to, rfv->hdr.c + offset, copy); else skb->csum = csum_block_add( skb->csum, csum_partial_copy_nocheck(rfv->hdr.c + offset, to, copy), odd); odd = 0; offset += copy; to += copy; len -= copy; if (!len) return 0; } offset -= rfv->hlen; return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb); } static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ipcm_cookie ipc; struct rtable *rt = NULL; struct flowi4 fl4; u8 tos, scope; int free = 0; __be32 daddr; __be32 saddr; int uc_index, err; struct ip_options_data opt_copy; struct raw_frag_vec rfv; int hdrincl; err = -EMSGSIZE; if (len > 0xFFFF) goto out; hdrincl = inet_test_bit(HDRINCL, sk); /* * Check the flags. */ err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */ goto out; /* compatibility */ /* * Get and verify the address. */ if (msg->msg_namelen) { DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); err = -EINVAL; if (msg->msg_namelen < sizeof(*usin)) goto out; if (usin->sin_family != AF_INET) { pr_info_once("%s: %s forgot to set AF_INET. Fix it!\n", __func__, current->comm); err = -EAFNOSUPPORT; if (usin->sin_family) goto out; } daddr = usin->sin_addr.s_addr; /* ANK: I did not forget to get protocol from port field. * I just do not know, who uses this weirdness. * IP_HDRINCL is much more convenient. */ } else { err = -EDESTADDRREQ; if (sk->sk_state != TCP_ESTABLISHED) goto out; daddr = inet->inet_daddr; } ipcm_init_sk(&ipc, inet); /* Keep backward compat */ if (hdrincl) ipc.protocol = IPPROTO_RAW; if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); if (unlikely(err)) { kfree(ipc.opt); goto out; } if (ipc.opt) free = 1; } saddr = ipc.addr; ipc.addr = daddr; if (!ipc.opt) { struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { memcpy(&opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); ipc.opt = &opt_copy.opt; } rcu_read_unlock(); } if (ipc.opt) { err = -EINVAL; /* Linux does not mangle headers on raw sockets, * so that IP options + IP_HDRINCL is non-sense. */ if (hdrincl) goto done; if (ipc.opt->opt.srr) { if (!daddr) goto done; daddr = ipc.opt->opt.faddr; } } tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); uc_index = READ_ONCE(inet->uc_index); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = READ_ONCE(inet->mc_index); if (!saddr) saddr = READ_ONCE(inet->mc_addr); } else if (!ipc.oif) { ipc.oif = uc_index; } else if (ipv4_is_lbcast(daddr) && uc_index) { /* oif is set, packet is to local broadcast * and uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. * If so, we want to allow the send using the uc_index. */ if (ipc.oif != uc_index && ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk), uc_index)) { ipc.oif = uc_index; } } flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk->sk_uid); fl4.fl4_icmp_type = 0; fl4.fl4_icmp_code = 0; if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; err = raw_probe_proto_opt(&rfv, &fl4); if (err) goto done; } security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; goto done; } err = -EACCES; if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) goto done; if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; back_from_confirm: if (hdrincl) err = raw_send_hdrinc(sk, &fl4, msg, len, &rt, msg->msg_flags, &ipc.sockc); else { if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); err = ip_append_data(sk, &fl4, raw_getfrag, &rfv, len, 0, &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) { err = ip_push_pending_frames(sk, &fl4); if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) err = 0; } release_sock(sk); } done: if (free) kfree(ipc.opt); ip_rt_put(rt); out: if (err < 0) return err; return len; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(&rt->dst, &fl4.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto done; } static void raw_close(struct sock *sk, long timeout) { /* * Raw sockets may have direct kernel references. Kill them. */ ip_ra_control(sk, 0, NULL); sk_common_release(sk); } static void raw_destroy(struct sock *sk) { lock_sock(sk); ip_flush_pending_frames(sk); release_sock(sk); } /* This gets rid of all the nasties in af_inet. -DaveM */ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; struct net *net = sock_net(sk); u32 tb_id = RT_TABLE_LOCAL; int ret = -EINVAL; int chk_addr_ret; lock_sock(sk); if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) goto out; if (sk->sk_bound_dev_if) tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); ret = -EADDRNOTAVAIL; if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr, chk_addr_ret)) goto out; inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ sk_dst_reset(sk); ret = 0; out: release_sock(sk); return ret; } /* * This should be easy, if there is something there * we return it, otherwise we block. */ static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); size_t copied = 0; int err = -EOPNOTSUPP; DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; if (flags & MSG_OOB) goto out; if (flags & MSG_ERRQUEUE) { err = ip_recv_error(sk, msg, len, addr_len); goto out; } skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_cmsgs(msg, sk, skb); /* Copy the address. */ if (sin) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } if (inet_cmsg_flags(inet)) ip_cmsg_recv(msg, skb); if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: if (err) return err; return copied; } static int raw_sk_init(struct sock *sk) { struct raw_sock *rp = raw_sk(sk); if (inet_sk(sk)->inet_num == IPPROTO_ICMP) memset(&rp->filter, 0, sizeof(rp->filter)); return 0; } static int raw_seticmpfilter(struct sock *sk, sockptr_t optval, int optlen) { if (optlen > sizeof(struct icmp_filter)) optlen = sizeof(struct icmp_filter); if (copy_from_sockptr(&raw_sk(sk)->filter, optval, optlen)) return -EFAULT; return 0; } static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen) { int len, ret = -EFAULT; if (get_user(len, optlen)) goto out; ret = -EINVAL; if (len < 0) goto out; if (len > sizeof(struct icmp_filter)) len = sizeof(struct icmp_filter); ret = -EFAULT; if (put_user(len, optlen) || copy_to_user(optval, &raw_sk(sk)->filter, len)) goto out; ret = 0; out: return ret; } static int do_raw_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { if (optname == ICMP_FILTER) { if (inet_sk(sk)->inet_num != IPPROTO_ICMP) return -EOPNOTSUPP; else return raw_seticmpfilter(sk, optval, optlen); } return -ENOPROTOOPT; } static int raw_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { if (level != SOL_RAW) return ip_setsockopt(sk, level, optname, optval, optlen); return do_raw_setsockopt(sk, optname, optval, optlen); } static int do_raw_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { if (optname == ICMP_FILTER) { if (inet_sk(sk)->inet_num != IPPROTO_ICMP) return -EOPNOTSUPP; else return raw_geticmpfilter(sk, optval, optlen); } return -ENOPROTOOPT; } static int raw_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { if (level != SOL_RAW) return ip_getsockopt(sk, level, optname, optval, optlen); return do_raw_getsockopt(sk, optname, optval, optlen); } static int raw_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); return 0; } case SIOCINQ: { struct sk_buff *skb; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) *karg = skb->len; else *karg = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); return 0; } default: #ifdef CONFIG_IP_MROUTE return ipmr_ioctl(sk, cmd, karg); #else return -ENOIOCTLCMD; #endif } } #ifdef CONFIG_COMPAT static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) { switch (cmd) { case SIOCOUTQ: case SIOCINQ: return -ENOIOCTLCMD; default: #ifdef CONFIG_IP_MROUTE return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg)); #else return -ENOIOCTLCMD; #endif } } #endif int raw_abort(struct sock *sk, int err) { lock_sock(sk); sk->sk_err = err; sk_error_report(sk); __udp_disconnect(sk, 0); release_sock(sk); return 0; } EXPORT_SYMBOL_GPL(raw_abort); struct proto raw_prot = { .name = "RAW", .owner = THIS_MODULE, .close = raw_close, .destroy = raw_destroy, .connect = ip4_datagram_connect, .disconnect = __udp_disconnect, .ioctl = raw_ioctl, .init = raw_sk_init, .setsockopt = raw_setsockopt, .getsockopt = raw_getsockopt, .sendmsg = raw_sendmsg, .recvmsg = raw_recvmsg, .bind = raw_bind, .backlog_rcv = raw_rcv_skb, .release_cb = ip4_datagram_release_cb, .hash = raw_hash_sk, .unhash = raw_unhash_sk, .obj_size = sizeof(struct raw_sock), .useroffset = offsetof(struct raw_sock, filter), .usersize = sizeof_field(struct raw_sock, filter), .h.raw_hash = &raw_v4_hashinfo, #ifdef CONFIG_COMPAT .compat_ioctl = compat_raw_ioctl, #endif .diag_destroy = raw_abort, }; #ifdef CONFIG_PROC_FS static struct sock *raw_get_first(struct seq_file *seq, int bucket) { struct raw_hashinfo *h = pde_data(file_inode(seq->file)); struct raw_iter_state *state = raw_seq_private(seq); struct hlist_head *hlist; struct sock *sk; for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE; ++state->bucket) { hlist = &h->ht[state->bucket]; sk_for_each(sk, hlist) { if (sock_net(sk) == seq_file_net(seq)) return sk; } } return NULL; } static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) { struct raw_iter_state *state = raw_seq_private(seq); do { sk = sk_next(sk); } while (sk && sock_net(sk) != seq_file_net(seq)); if (!sk) return raw_get_first(seq, state->bucket + 1); return sk; } static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) { struct sock *sk = raw_get_first(seq, 0); if (sk) while (pos && (sk = raw_get_next(seq, sk)) != NULL) --pos; return pos ? NULL : sk; } void *raw_seq_start(struct seq_file *seq, loff_t *pos) __acquires(&h->lock) { struct raw_hashinfo *h = pde_data(file_inode(seq->file)); spin_lock(&h->lock); return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } EXPORT_SYMBOL_GPL(raw_seq_start); void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == SEQ_START_TOKEN) sk = raw_get_first(seq, 0); else sk = raw_get_next(seq, v); ++*pos; return sk; } EXPORT_SYMBOL_GPL(raw_seq_next); void raw_seq_stop(struct seq_file *seq, void *v) __releases(&h->lock) { struct raw_hashinfo *h = pde_data(file_inode(seq->file)); spin_unlock(&h->lock); } EXPORT_SYMBOL_GPL(raw_seq_stop); static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr, src = inet->inet_rcv_saddr; __u16 destp = 0, srcp = inet->inet_num; seq_printf(seq, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n", i, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); } static int raw_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_printf(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops\n"); else raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); return 0; } static const struct seq_operations raw_seq_ops = { .start = raw_seq_start, .next = raw_seq_next, .stop = raw_seq_stop, .show = raw_seq_show, }; static __net_init int raw_init_net(struct net *net) { if (!proc_create_net_data("raw", 0444, net->proc_net, &raw_seq_ops, sizeof(struct raw_iter_state), &raw_v4_hashinfo)) return -ENOMEM; return 0; } static __net_exit void raw_exit_net(struct net *net) { remove_proc_entry("raw", net->proc_net); } static __net_initdata struct pernet_operations raw_net_ops = { .init = raw_init_net, .exit = raw_exit_net, }; int __init raw_proc_init(void) { return register_pernet_subsys(&raw_net_ops); } void __init raw_proc_exit(void) { unregister_pernet_subsys(&raw_net_ops); } #endif /* CONFIG_PROC_FS */ static void raw_sysctl_init_net(struct net *net) { #ifdef CONFIG_NET_L3_MASTER_DEV net->ipv4.sysctl_raw_l3mdev_accept = 1; #endif } static int __net_init raw_sysctl_init(struct net *net) { raw_sysctl_init_net(net); return 0; } static struct pernet_operations __net_initdata raw_sysctl_ops = { .init = raw_sysctl_init, }; void __init raw_init(void) { raw_sysctl_init_net(&init_net); if (register_pernet_subsys(&raw_sysctl_ops)) panic("RAW: failed to init sysctl parameters.\n"); } |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 | // SPDX-License-Identifier: GPL-2.0-only /* * ebt_mark_m * * Authors: * Bart De Schuymer <bdschuym@pandora.be> * * July, 2002 * */ #include <linux/module.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_mark_m.h> static bool ebt_mark_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct ebt_mark_m_info *info = par->matchinfo; if (info->bitmask & EBT_MARK_OR) return !!(skb->mark & info->mask) ^ info->invert; return ((skb->mark & info->mask) == info->mark) ^ info->invert; } static int ebt_mark_mt_check(const struct xt_mtchk_param *par) { const struct ebt_mark_m_info *info = par->matchinfo; if (info->bitmask & ~EBT_MARK_MASK) return -EINVAL; if ((info->bitmask & EBT_MARK_OR) && (info->bitmask & EBT_MARK_AND)) return -EINVAL; if (!info->bitmask) return -EINVAL; return 0; } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ebt_mark_m_info { compat_ulong_t mark, mask; uint8_t invert, bitmask; }; static void mark_mt_compat_from_user(void *dst, const void *src) { const struct compat_ebt_mark_m_info *user = src; struct ebt_mark_m_info *kern = dst; kern->mark = user->mark; kern->mask = user->mask; kern->invert = user->invert; kern->bitmask = user->bitmask; } static int mark_mt_compat_to_user(void __user *dst, const void *src) { struct compat_ebt_mark_m_info __user *user = dst; const struct ebt_mark_m_info *kern = src; if (put_user(kern->mark, &user->mark) || put_user(kern->mask, &user->mask) || put_user(kern->invert, &user->invert) || put_user(kern->bitmask, &user->bitmask)) return -EFAULT; return 0; } #endif static struct xt_match ebt_mark_mt_reg __read_mostly = { .name = "mark_m", .revision = 0, .family = NFPROTO_BRIDGE, .match = ebt_mark_mt, .checkentry = ebt_mark_mt_check, .matchsize = sizeof(struct ebt_mark_m_info), #ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(struct compat_ebt_mark_m_info), .compat_from_user = mark_mt_compat_from_user, .compat_to_user = mark_mt_compat_to_user, #endif .me = THIS_MODULE, }; static int __init ebt_mark_m_init(void) { return xt_register_match(&ebt_mark_mt_reg); } static void __exit ebt_mark_m_fini(void) { xt_unregister_match(&ebt_mark_mt_reg); } module_init(ebt_mark_m_init); module_exit(ebt_mark_m_fini); MODULE_DESCRIPTION("Ebtables: Packet mark match"); MODULE_LICENSE("GPL"); |
49469 164 49455 49458 2 5 4 49432 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ #ifndef _ASM_X86_STACKTRACE_H #define _ASM_X86_STACKTRACE_H #include <linux/uaccess.h> #include <linux/ptrace.h> #include <asm/cpu_entry_area.h> #include <asm/switch_to.h> enum stack_type { STACK_TYPE_UNKNOWN, STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ, STACK_TYPE_ENTRY, STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, }; struct stack_info { enum stack_type type; unsigned long *begin, *end, *next_sp; }; bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info); bool in_entry_stack(unsigned long *stack, struct stack_info *info); int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, struct stack_info *info); static __always_inline bool get_stack_guard_info(unsigned long *stack, struct stack_info *info) { /* make sure it's not in the stack proper */ if (get_stack_info_noinstr(stack, current, info)) return false; /* but if it is in the page below it, we hit a guard */ return get_stack_info_noinstr((void *)stack + PAGE_SIZE, current, info); } const char *stack_type_name(enum stack_type type); static inline bool on_stack(struct stack_info *info, void *addr, size_t len) { void *begin = info->begin; void *end = info->end; return (info->type != STACK_TYPE_UNKNOWN && addr >= begin && addr < end && addr + len > begin && addr + len <= end); } #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 #else #define STACKSLOTS_PER_LINE 4 #endif #ifdef CONFIG_FRAME_POINTER static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->bp; if (task == current) return __builtin_frame_address(0); return &((struct inactive_task_frame *)task->thread.sp)->bp; } #else static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { return NULL; } #endif /* CONFIG_FRAME_POINTER */ static inline unsigned long * get_stack_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->sp; if (task == current) return __builtin_frame_address(0); return (unsigned long *)task->thread.sp; } /* The form of the top of the frame on the stack */ struct stack_frame { struct stack_frame *next_frame; unsigned long return_address; }; struct stack_frame_ia32 { u32 next_frame; u32 return_address; }; void show_opcodes(struct pt_regs *regs, const char *loglvl); void show_ip(struct pt_regs *regs, const char *loglvl); #endif /* _ASM_X86_STACKTRACE_H */ |
6 1 1 1 2 1 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 | // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/if.h> #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/tcp.h> #include <net/ip.h> #include <net/tcp.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_log.h> #include <linux/netfilter/nfnetlink_osf.h> /* * Indexed by dont-fragment bit. * It is the only constant value in the fingerprint. */ struct list_head nf_osf_fingers[2]; EXPORT_SYMBOL_GPL(nf_osf_fingers); static inline int nf_osf_ttl(const struct sk_buff *skb, int ttl_check, unsigned char f_ttl) { struct in_device *in_dev = __in_dev_get_rcu(skb->dev); const struct iphdr *ip = ip_hdr(skb); const struct in_ifaddr *ifa; int ret = 0; if (ttl_check == NF_OSF_TTL_TRUE) return ip->ttl == f_ttl; if (ttl_check == NF_OSF_TTL_NOCHECK) return 1; else if (ip->ttl <= f_ttl) return 1; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(ip->saddr, ifa)) { ret = (ip->ttl == f_ttl); break; } } return ret; } struct nf_osf_hdr_ctx { bool df; u16 window; u16 totlen; const unsigned char *optp; unsigned int optsize; }; static bool nf_osf_match_one(const struct sk_buff *skb, const struct nf_osf_user_finger *f, int ttl_check, struct nf_osf_hdr_ctx *ctx) { const __u8 *optpinit = ctx->optp; unsigned int check_WSS = 0; int fmatch = FMATCH_WRONG; int foptsize, optnum; u16 mss = 0; if (ctx->totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl)) return false; /* * Should not happen if userspace parser was written correctly. */ if (f->wss.wc >= OSF_WSS_MAX) return false; /* Check options */ foptsize = 0; for (optnum = 0; optnum < f->opt_num; ++optnum) foptsize += f->opt[optnum].length; if (foptsize > MAX_IPOPTLEN || ctx->optsize > MAX_IPOPTLEN || ctx->optsize != foptsize) return false; check_WSS = f->wss.wc; for (optnum = 0; optnum < f->opt_num; ++optnum) { if (f->opt[optnum].kind == *ctx->optp) { __u32 len = f->opt[optnum].length; const __u8 *optend = ctx->optp + len; fmatch = FMATCH_OK; switch (*ctx->optp) { case OSFOPT_MSS: mss = ctx->optp[3]; mss <<= 8; mss |= ctx->optp[2]; mss = ntohs((__force __be16)mss); break; case OSFOPT_TS: break; } ctx->optp = optend; } else fmatch = FMATCH_OPT_WRONG; if (fmatch != FMATCH_OK) break; } if (fmatch != FMATCH_OPT_WRONG) { fmatch = FMATCH_WRONG; switch (check_WSS) { case OSF_WSS_PLAIN: if (f->wss.val == 0 || ctx->window == f->wss.val) fmatch = FMATCH_OK; break; case OSF_WSS_MSS: /* * Some smart modems decrease mangle MSS to * SMART_MSS_2, so we check standard, decreased * and the one provided in the fingerprint MSS * values. */ #define SMART_MSS_1 1460 #define SMART_MSS_2 1448 if (ctx->window == f->wss.val * mss || ctx->window == f->wss.val * SMART_MSS_1 || ctx->window == f->wss.val * SMART_MSS_2) fmatch = FMATCH_OK; break; case OSF_WSS_MTU: if (ctx->window == f->wss.val * (mss + 40) || ctx->window == f->wss.val * (SMART_MSS_1 + 40) || ctx->window == f->wss.val * (SMART_MSS_2 + 40)) fmatch = FMATCH_OK; break; case OSF_WSS_MODULO: if ((ctx->window % f->wss.val) == 0) fmatch = FMATCH_OK; break; } } if (fmatch != FMATCH_OK) ctx->optp = optpinit; return fmatch == FMATCH_OK; } static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx, const struct sk_buff *skb, const struct iphdr *ip, unsigned char *opts, struct tcphdr *_tcph) { const struct tcphdr *tcp; tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), _tcph); if (!tcp) return NULL; if (!tcp->syn) return NULL; ctx->totlen = ntohs(ip->tot_len); ctx->df = ntohs(ip->frag_off) & IP_DF; ctx->window = ntohs(tcp->window); if (tcp->doff * 4 > sizeof(struct tcphdr)) { ctx->optsize = tcp->doff * 4 - sizeof(struct tcphdr); ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) + sizeof(struct tcphdr), ctx->optsize, opts); if (!ctx->optp) return NULL; } return tcp; } bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, int hooknum, struct net_device *in, struct net_device *out, const struct nf_osf_info *info, struct net *net, const struct list_head *nf_osf_fingers) { const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; unsigned char opts[MAX_IPOPTLEN]; const struct nf_osf_finger *kf; int fcount = 0, ttl_check; int fmatch = FMATCH_WRONG; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; struct tcphdr _tcph; memset(&ctx, 0, sizeof(ctx)); tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph); if (!tcp) return false; ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : 0; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre)) continue; if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; fmatch = FMATCH_OK; fcount++; if (info->flags & NF_OSF_LOG) nf_log_packet(net, family, hooknum, skb, in, out, NULL, "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", f->genre, f->version, f->subtype, &ip->saddr, ntohs(tcp->source), &ip->daddr, ntohs(tcp->dest), f->ttl - ip->ttl); if ((info->flags & NF_OSF_LOG) && info->loglevel == NF_OSF_LOGLEVEL_FIRST) break; } if (!fcount && (info->flags & NF_OSF_LOG)) nf_log_packet(net, family, hooknum, skb, in, out, NULL, "Remote OS is not known: %pI4:%u -> %pI4:%u\n", &ip->saddr, ntohs(tcp->source), &ip->daddr, ntohs(tcp->dest)); if (fcount) fmatch = FMATCH_OK; return fmatch == FMATCH_OK; } EXPORT_SYMBOL_GPL(nf_osf_match); bool nf_osf_find(const struct sk_buff *skb, const struct list_head *nf_osf_fingers, const int ttl_check, struct nf_osf_data *data) { const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; unsigned char opts[MAX_IPOPTLEN]; const struct nf_osf_finger *kf; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; struct tcphdr _tcph; bool found = false; memset(&ctx, 0, sizeof(ctx)); tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph); if (!tcp) return false; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; data->genre = f->genre; data->version = f->version; found = true; break; } return found; } EXPORT_SYMBOL_GPL(nf_osf_find); static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = { [OSF_ATTR_FINGER] = { .len = sizeof(struct nf_osf_user_finger) }, }; static int nfnl_osf_add_callback(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const osf_attrs[]) { struct nf_osf_user_finger *f; struct nf_osf_finger *kf = NULL, *sf; int err = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -EINVAL; f = nla_data(osf_attrs[OSF_ATTR_FINGER]); if (f->opt_num > ARRAY_SIZE(f->opt)) return -EINVAL; if (!memchr(f->genre, 0, MAXGENRELEN) || !memchr(f->subtype, 0, MAXGENRELEN) || !memchr(f->version, 0, MAXGENRELEN)) return -EINVAL; kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL); if (!kf) return -ENOMEM; memcpy(&kf->finger, f, sizeof(struct nf_osf_user_finger)); list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) { if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger))) continue; kfree(kf); kf = NULL; if (info->nlh->nlmsg_flags & NLM_F_EXCL) err = -EEXIST; break; } /* * We are protected by nfnl mutex. */ if (kf) list_add_tail_rcu(&kf->finger_entry, &nf_osf_fingers[!!f->df]); return err; } static int nfnl_osf_remove_callback(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const osf_attrs[]) { struct nf_osf_user_finger *f; struct nf_osf_finger *sf; int err = -ENOENT; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; f = nla_data(osf_attrs[OSF_ATTR_FINGER]); list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) { if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger))) continue; /* * We are protected by nfnl mutex. */ list_del_rcu(&sf->finger_entry); kfree_rcu(sf, rcu_head); err = 0; break; } return err; } static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = { [OSF_MSG_ADD] = { .call = nfnl_osf_add_callback, .type = NFNL_CB_MUTEX, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, [OSF_MSG_REMOVE] = { .call = nfnl_osf_remove_callback, .type = NFNL_CB_MUTEX, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, }; static const struct nfnetlink_subsystem nfnl_osf_subsys = { .name = "osf", .subsys_id = NFNL_SUBSYS_OSF, .cb_count = OSF_MSG_MAX, .cb = nfnl_osf_callbacks, }; static int __init nfnl_osf_init(void) { int err = -EINVAL; int i; for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) INIT_LIST_HEAD(&nf_osf_fingers[i]); err = nfnetlink_subsys_register(&nfnl_osf_subsys); if (err < 0) { pr_err("Failed to register OSF nsfnetlink helper (%d)\n", err); goto err_out_exit; } return 0; err_out_exit: return err; } static void __exit nfnl_osf_fini(void) { struct nf_osf_finger *f; int i; nfnetlink_subsys_unregister(&nfnl_osf_subsys); rcu_read_lock(); for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) { list_for_each_entry_rcu(f, &nf_osf_fingers[i], finger_entry) { list_del_rcu(&f->finger_entry); kfree_rcu(f, rcu_head); } } rcu_read_unlock(); rcu_barrier(); } module_init(nfnl_osf_init); module_exit(nfnl_osf_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Passive OS fingerprint matching"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); |
4 4 4 4 4 4 4 4 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 | // SPDX-License-Identifier: GPL-2.0 /* * attribute_container.c - implementation of a simple container for classes * * Copyright (c) 2005 - James Bottomley <James.Bottomley@steeleye.com> * * The basic idea here is to enable a device to be attached to an * aritrary numer of classes without having to allocate storage for them. * Instead, the contained classes select the devices they need to attach * to via a matching function. */ #include <linux/attribute_container.h> #include <linux/device.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/module.h> #include <linux/mutex.h> #include "base.h" /* This is a private structure used to tie the classdev and the * container .. it should never be visible outside this file */ struct internal_container { struct klist_node node; struct attribute_container *cont; struct device classdev; }; static void internal_container_klist_get(struct klist_node *n) { struct internal_container *ic = container_of(n, struct internal_container, node); get_device(&ic->classdev); } static void internal_container_klist_put(struct klist_node *n) { struct internal_container *ic = container_of(n, struct internal_container, node); put_device(&ic->classdev); } /** * attribute_container_classdev_to_container - given a classdev, return the container * * @classdev: the class device created by attribute_container_add_device. * * Returns the container associated with this classdev. */ struct attribute_container * attribute_container_classdev_to_container(struct device *classdev) { struct internal_container *ic = container_of(classdev, struct internal_container, classdev); return ic->cont; } EXPORT_SYMBOL_GPL(attribute_container_classdev_to_container); static LIST_HEAD(attribute_container_list); static DEFINE_MUTEX(attribute_container_mutex); /** * attribute_container_register - register an attribute container * * @cont: The container to register. This must be allocated by the * callee and should also be zeroed by it. */ int attribute_container_register(struct attribute_container *cont) { INIT_LIST_HEAD(&cont->node); klist_init(&cont->containers, internal_container_klist_get, internal_container_klist_put); mutex_lock(&attribute_container_mutex); list_add_tail(&cont->node, &attribute_container_list); mutex_unlock(&attribute_container_mutex); return 0; } EXPORT_SYMBOL_GPL(attribute_container_register); /** * attribute_container_unregister - remove a container registration * * @cont: previously registered container to remove */ int attribute_container_unregister(struct attribute_container *cont) { int retval = -EBUSY; mutex_lock(&attribute_container_mutex); spin_lock(&cont->containers.k_lock); if (!list_empty(&cont->containers.k_list)) goto out; retval = 0; list_del(&cont->node); out: spin_unlock(&cont->containers.k_lock); mutex_unlock(&attribute_container_mutex); return retval; } EXPORT_SYMBOL_GPL(attribute_container_unregister); /* private function used as class release */ static void attribute_container_release(struct device *classdev) { struct internal_container *ic = container_of(classdev, struct internal_container, classdev); struct device *dev = classdev->parent; kfree(ic); put_device(dev); } /** * attribute_container_add_device - see if any container is interested in dev * * @dev: device to add attributes to * @fn: function to trigger addition of class device. * * This function allocates storage for the class device(s) to be * attached to dev (one for each matching attribute_container). If no * fn is provided, the code will simply register the class device via * device_add. If a function is provided, it is expected to add * the class device at the appropriate time. One of the things that * might be necessary is to allocate and initialise the classdev and * then add it a later time. To do this, call this routine for * allocation and initialisation and then use * attribute_container_device_trigger() to call device_add() on * it. Note: after this, the class device contains a reference to dev * which is not relinquished until the release of the classdev. */ void attribute_container_add_device(struct device *dev, int (*fn)(struct attribute_container *, struct device *, struct device *)) { struct attribute_container *cont; mutex_lock(&attribute_container_mutex); list_for_each_entry(cont, &attribute_container_list, node) { struct internal_container *ic; if (attribute_container_no_classdevs(cont)) continue; if (!cont->match(cont, dev)) continue; ic = kzalloc(sizeof(*ic), GFP_KERNEL); if (!ic) { dev_err(dev, "failed to allocate class container\n"); continue; } ic->cont = cont; device_initialize(&ic->classdev); ic->classdev.parent = get_device(dev); ic->classdev.class = cont->class; cont->class->dev_release = attribute_container_release; dev_set_name(&ic->classdev, "%s", dev_name(dev)); if (fn) fn(cont, dev, &ic->classdev); else attribute_container_add_class_device(&ic->classdev); klist_add_tail(&ic->node, &cont->containers); } mutex_unlock(&attribute_container_mutex); } /* FIXME: can't break out of this unless klist_iter_exit is also * called before doing the break */ #define klist_for_each_entry(pos, head, member, iter) \ for (klist_iter_init(head, iter); (pos = ({ \ struct klist_node *n = klist_next(iter); \ n ? container_of(n, typeof(*pos), member) : \ ({ klist_iter_exit(iter) ; NULL; }); \ })) != NULL;) /** * attribute_container_remove_device - make device eligible for removal. * * @dev: The generic device * @fn: A function to call to remove the device * * This routine triggers device removal. If fn is NULL, then it is * simply done via device_unregister (note that if something * still has a reference to the classdev, then the memory occupied * will not be freed until the classdev is released). If you want a * two phase release: remove from visibility and then delete the * device, then you should use this routine with a fn that calls * device_del() and then use attribute_container_device_trigger() * to do the final put on the classdev. */ void attribute_container_remove_device(struct device *dev, void (*fn)(struct attribute_container *, struct device *, struct device *)) { struct attribute_container *cont; mutex_lock(&attribute_container_mutex); list_for_each_entry(cont, &attribute_container_list, node) { struct internal_container *ic; struct klist_iter iter; if (attribute_container_no_classdevs(cont)) continue; if (!cont->match(cont, dev)) continue; klist_for_each_entry(ic, &cont->containers, node, &iter) { if (dev != ic->classdev.parent) continue; klist_del(&ic->node); if (fn) fn(cont, dev, &ic->classdev); else { attribute_container_remove_attrs(&ic->classdev); device_unregister(&ic->classdev); } } } mutex_unlock(&attribute_container_mutex); } static int do_attribute_container_device_trigger_safe(struct device *dev, struct attribute_container *cont, int (*fn)(struct attribute_container *, struct device *, struct device *), int (*undo)(struct attribute_container *, struct device *, struct device *)) { int ret; struct internal_container *ic, *failed; struct klist_iter iter; if (attribute_container_no_classdevs(cont)) return fn(cont, dev, NULL); klist_for_each_entry(ic, &cont->containers, node, &iter) { if (dev == ic->classdev.parent) { ret = fn(cont, dev, &ic->classdev); if (ret) { failed = ic; klist_iter_exit(&iter); goto fail; } } } return 0; fail: if (!undo) return ret; /* Attempt to undo the work partially done. */ klist_for_each_entry(ic, &cont->containers, node, &iter) { if (ic == failed) { klist_iter_exit(&iter); break; } if (dev == ic->classdev.parent) undo(cont, dev, &ic->classdev); } return ret; } /** * attribute_container_device_trigger_safe - execute a trigger for each * matching classdev or fail all of them. * * @dev: The generic device to run the trigger for * @fn: the function to execute for each classdev. * @undo: A function to undo the work previously done in case of error * * This function is a safe version of * attribute_container_device_trigger. It stops on the first error and * undo the partial work that has been done, on previous classdev. It * is guaranteed that either they all succeeded, or none of them * succeeded. */ int attribute_container_device_trigger_safe(struct device *dev, int (*fn)(struct attribute_container *, struct device *, struct device *), int (*undo)(struct attribute_container *, struct device *, struct device *)) { struct attribute_container *cont, *failed = NULL; int ret = 0; mutex_lock(&attribute_container_mutex); list_for_each_entry(cont, &attribute_container_list, node) { if (!cont->match(cont, dev)) continue; ret = do_attribute_container_device_trigger_safe(dev, cont, fn, undo); if (ret) { failed = cont; break; } } if (ret && !WARN_ON(!undo)) { list_for_each_entry(cont, &attribute_container_list, node) { if (failed == cont) break; if (!cont->match(cont, dev)) continue; do_attribute_container_device_trigger_safe(dev, cont, undo, NULL); } } mutex_unlock(&attribute_container_mutex); return ret; } /** * attribute_container_device_trigger - execute a trigger for each matching classdev * * @dev: The generic device to run the trigger for * @fn: the function to execute for each classdev. * * This function is for executing a trigger when you need to know both * the container and the classdev. If you only care about the * container, then use attribute_container_trigger() instead. */ void attribute_container_device_trigger(struct device *dev, int (*fn)(struct attribute_container *, struct device *, struct device *)) { struct attribute_container *cont; mutex_lock(&attribute_container_mutex); list_for_each_entry(cont, &attribute_container_list, node) { struct internal_container *ic; struct klist_iter iter; if (!cont->match(cont, dev)) continue; if (attribute_container_no_classdevs(cont)) { fn(cont, dev, NULL); continue; } klist_for_each_entry(ic, &cont->containers, node, &iter) { if (dev == ic->classdev.parent) fn(cont, dev, &ic->classdev); } } mutex_unlock(&attribute_container_mutex); } /** * attribute_container_trigger - trigger a function for each matching container * * @dev: The generic device to activate the trigger for * @fn: the function to trigger * * This routine triggers a function that only needs to know the * matching containers (not the classdev) associated with a device. * It is more lightweight than attribute_container_device_trigger, so * should be used in preference unless the triggering function * actually needs to know the classdev. */ void attribute_container_trigger(struct device *dev, int (*fn)(struct attribute_container *, struct device *)) { struct attribute_container *cont; mutex_lock(&attribute_container_mutex); list_for_each_entry(cont, &attribute_container_list, node) { if (cont->match(cont, dev)) fn(cont, dev); } mutex_unlock(&attribute_container_mutex); } /** * attribute_container_add_attrs - add attributes * * @classdev: The class device * * This simply creates all the class device sysfs files from the * attributes listed in the container */ int attribute_container_add_attrs(struct device *classdev) { struct attribute_container *cont = attribute_container_classdev_to_container(classdev); struct device_attribute **attrs = cont->attrs; int i, error; BUG_ON(attrs && cont->grp); if (!attrs && !cont->grp) return 0; if (cont->grp) return sysfs_create_group(&classdev->kobj, cont->grp); for (i = 0; attrs[i]; i++) { sysfs_attr_init(&attrs[i]->attr); error = device_create_file(classdev, attrs[i]); if (error) return error; } return 0; } /** * attribute_container_add_class_device - same function as device_add * * @classdev: the class device to add * * This performs essentially the same function as device_add except for * attribute containers, namely add the classdev to the system and then * create the attribute files */ int attribute_container_add_class_device(struct device *classdev) { int error = device_add(classdev); if (error) return error; return attribute_container_add_attrs(classdev); } /** * attribute_container_add_class_device_adapter - simple adapter for triggers * * @cont: the container to register. * @dev: the generic device to activate the trigger for * @classdev: the class device to add * * This function is identical to attribute_container_add_class_device except * that it is designed to be called from the triggers */ int attribute_container_add_class_device_adapter(struct attribute_container *cont, struct device *dev, struct device *classdev) { return attribute_container_add_class_device(classdev); } /** * attribute_container_remove_attrs - remove any attribute files * * @classdev: The class device to remove the files from * */ void attribute_container_remove_attrs(struct device *classdev) { struct attribute_container *cont = attribute_container_classdev_to_container(classdev); struct device_attribute **attrs = cont->attrs; int i; if (!attrs && !cont->grp) return; if (cont->grp) { sysfs_remove_group(&classdev->kobj, cont->grp); return ; } for (i = 0; attrs[i]; i++) device_remove_file(classdev, attrs[i]); } /** * attribute_container_class_device_del - equivalent of class_device_del * * @classdev: the class device * * This function simply removes all the attribute files and then calls * device_del. */ void attribute_container_class_device_del(struct device *classdev) { attribute_container_remove_attrs(classdev); device_del(classdev); } /** * attribute_container_find_class_device - find the corresponding class_device * * @cont: the container * @dev: the generic device * * Looks up the device in the container's list of class devices and returns * the corresponding class_device. */ struct device * attribute_container_find_class_device(struct attribute_container *cont, struct device *dev) { struct device *cdev = NULL; struct internal_container *ic; struct klist_iter iter; klist_for_each_entry(ic, &cont->containers, node, &iter) { if (ic->classdev.parent == dev) { cdev = &ic->classdev; /* FIXME: must exit iterator then break */ klist_iter_exit(&iter); break; } } return cdev; } EXPORT_SYMBOL_GPL(attribute_container_find_class_device); |
1 557 8 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner */ #include "bat_v.h" #include "main.h" #include <linux/atomic.h> #include <linux/cache.h> #include <linux/errno.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/limits.h> #include <linux/list.h> #include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/genetlink.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bat_v_elp.h" #include "bat_v_ogm.h" #include "gateway_client.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "originator.h" static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hard_iface *primary_if; primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if) { batadv_v_elp_iface_activate(primary_if, hard_iface); batadv_hardif_put(primary_if); } /* B.A.T.M.A.N. V does not use any queuing mechanism, therefore it can * set the interface as ACTIVE right away, without any risk of race * condition */ if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED) hard_iface->if_status = BATADV_IF_ACTIVE; } static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface) { int ret; ret = batadv_v_elp_iface_enable(hard_iface); if (ret < 0) return ret; ret = batadv_v_ogm_iface_enable(hard_iface); if (ret < 0) batadv_v_elp_iface_disable(hard_iface); return ret; } static void batadv_v_iface_disable(struct batadv_hard_iface *hard_iface) { batadv_v_ogm_iface_disable(hard_iface); batadv_v_elp_iface_disable(hard_iface); } static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface) { batadv_v_elp_primary_iface_set(hard_iface); batadv_v_ogm_primary_iface_set(hard_iface); } /** * batadv_v_iface_update_mac() - react to hard-interface MAC address change * @hard_iface: the modified interface * * If the modified interface is the primary one, update the originator * address in the ELP and OGM messages to reflect the new MAC address. */ static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hard_iface *primary_if; primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if != hard_iface) goto out; batadv_v_primary_iface_set(hard_iface); out: batadv_hardif_put(primary_if); } static void batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) { ewma_throughput_init(&hardif_neigh->bat_v.throughput); INIT_WORK(&hardif_neigh->bat_v.metric_work, batadv_v_elp_throughput_metric_update); } /** * batadv_v_neigh_dump_neigh() - Dump a neighbour into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @hardif_neigh: Neighbour to dump * * Return: Error code, or 0 on success */ static int batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_hardif_neigh_node *hardif_neigh) { void *hdr; unsigned int last_seen_msecs; u32 throughput; last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen); throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput); throughput = throughput * 100; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_NEIGHBORS); if (!hdr) return -ENOBUFS; if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, hardif_neigh->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, hardif_neigh->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, hardif_neigh->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs) || nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_v_neigh_dump_hardif() - Dump the neighbours of a hard interface into * a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @hard_iface: The hard interface to be dumped * @idx_s: Entries to be skipped * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_v_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface, int *idx_s) { struct batadv_hardif_neigh_node *hardif_neigh; int idx = 0; hlist_for_each_entry_rcu(hardif_neigh, &hard_iface->neigh_list, list) { if (idx++ < *idx_s) continue; if (batadv_v_neigh_dump_neigh(msg, portid, seq, hardif_neigh)) { *idx_s = idx - 1; return -EMSGSIZE; } } *idx_s = 0; return 0; } /** * batadv_v_neigh_dump() - Dump the neighbours of a hard interface into a * message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @single_hardif: Limit dumping to this hard interface */ static void batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hard_iface *single_hardif) { struct batadv_hard_iface *hard_iface; int i_hardif = 0; int i_hardif_s = cb->args[0]; int idx = cb->args[1]; int portid = NETLINK_CB(cb->skb).portid; rcu_read_lock(); if (single_hardif) { if (i_hardif_s == 0) { if (batadv_v_neigh_dump_hardif(msg, portid, cb->nlh->nlmsg_seq, bat_priv, single_hardif, &idx) == 0) i_hardif++; } } else { list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != bat_priv->soft_iface) continue; if (i_hardif++ < i_hardif_s) continue; if (batadv_v_neigh_dump_hardif(msg, portid, cb->nlh->nlmsg_seq, bat_priv, hard_iface, &idx)) { i_hardif--; break; } } } rcu_read_unlock(); cb->args[0] = i_hardif; cb->args[1] = idx; } /** * batadv_v_orig_dump_subentry() - Dump an originator subentry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @neigh_node: Single hops neighbour * @best: Is the best originator * * Return: Error code, or 0 on success */ static int batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, bool best) { struct batadv_neigh_ifinfo *n_ifinfo; unsigned int last_seen_msecs; u32 throughput; void *hdr; n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); if (!n_ifinfo) return 0; throughput = n_ifinfo->bat_v.throughput * 100; batadv_neigh_ifinfo_put(n_ifinfo); last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen); if (if_outgoing != BATADV_IF_DEFAULT && if_outgoing != neigh_node->if_incoming) return 0; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_ORIGINATORS); if (!hdr) return -ENOBUFS; if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) || nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, neigh_node->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, neigh_node->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, neigh_node->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) goto nla_put_failure; if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_v_orig_dump_entry() - Dump an originator entry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @sub_s: Number of sub entries to skip * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct batadv_orig_node *orig_node, int *sub_s) { struct batadv_neigh_node *neigh_node_best; struct batadv_neigh_node *neigh_node; int sub = 0; bool best; neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing); if (!neigh_node_best) goto out; hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { if (sub++ < *sub_s) continue; best = (neigh_node == neigh_node_best); if (batadv_v_orig_dump_subentry(msg, portid, seq, bat_priv, if_outgoing, orig_node, neigh_node, best)) { batadv_neigh_node_put(neigh_node_best); *sub_s = sub - 1; return -EMSGSIZE; } } out: batadv_neigh_node_put(neigh_node_best); *sub_s = 0; return 0; } /** * batadv_v_orig_dump_bucket() - Dump an originator bucket into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @head: Bucket to be dumped * @idx_s: Number of entries to be skipped * @sub: Number of sub entries to be skipped * * Return: Error code, or 0 on success */ static int batadv_v_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct hlist_head *head, int *idx_s, int *sub) { struct batadv_orig_node *orig_node; int idx = 0; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { if (idx++ < *idx_s) continue; if (batadv_v_orig_dump_entry(msg, portid, seq, bat_priv, if_outgoing, orig_node, sub)) { rcu_read_unlock(); *idx_s = idx - 1; return -EMSGSIZE; } } rcu_read_unlock(); *idx_s = 0; *sub = 0; return 0; } /** * batadv_v_orig_dump() - Dump the originators into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface */ static void batadv_v_orig_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; int bucket = cb->args[0]; int idx = cb->args[1]; int sub = cb->args[2]; int portid = NETLINK_CB(cb->skb).portid; while (bucket < hash->size) { head = &hash->table[bucket]; if (batadv_v_orig_dump_bucket(msg, portid, cb->nlh->nlmsg_seq, bat_priv, if_outgoing, head, &idx, &sub)) break; bucket++; } cb->args[0] = bucket; cb->args[1] = idx; cb->args[2] = sub; } static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2; int ret = 0; ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); if (!ifinfo1) goto err_ifinfo1; ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); if (!ifinfo2) goto err_ifinfo2; ret = ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput; batadv_neigh_ifinfo_put(ifinfo2); err_ifinfo2: batadv_neigh_ifinfo_put(ifinfo1); err_ifinfo1: return ret; } static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2; u32 threshold; bool ret = false; ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); if (!ifinfo1) goto err_ifinfo1; ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); if (!ifinfo2) goto err_ifinfo2; threshold = ifinfo1->bat_v.throughput / 4; threshold = ifinfo1->bat_v.throughput - threshold; ret = ifinfo2->bat_v.throughput > threshold; batadv_neigh_ifinfo_put(ifinfo2); err_ifinfo2: batadv_neigh_ifinfo_put(ifinfo1); err_ifinfo1: return ret; } /** * batadv_v_init_sel_class() - initialize GW selection class * @bat_priv: the bat priv with all the soft interface information */ static void batadv_v_init_sel_class(struct batadv_priv *bat_priv) { /* set default throughput difference threshold to 5Mbps */ atomic_set(&bat_priv->gw.sel_class, 50); } /** * batadv_v_gw_throughput_get() - retrieve the GW-bandwidth for a given GW * @gw_node: the GW to retrieve the metric for * @bw: the pointer where the metric will be stored. The metric is computed as * the minimum between the GW advertised throughput and the path throughput to * it in the mesh * * Return: 0 on success, -1 on failure */ static int batadv_v_gw_throughput_get(struct batadv_gw_node *gw_node, u32 *bw) { struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_orig_node *orig_node; struct batadv_neigh_node *router; int ret = -1; orig_node = gw_node->orig_node; router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); if (!router) goto out; router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) goto out; /* the GW metric is computed as the minimum between the path throughput * to reach the GW itself and the advertised bandwidth. * This gives us an approximation of the effective throughput that the * client can expect via this particular GW node */ *bw = router_ifinfo->bat_v.throughput; *bw = min_t(u32, *bw, gw_node->bandwidth_down); ret = 0; out: batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(router_ifinfo); return ret; } /** * batadv_v_gw_get_best_gw_node() - retrieve the best GW node * @bat_priv: the bat priv with all the soft interface information * * Return: the GW node having the best GW-metric, NULL if no GW is known */ static struct batadv_gw_node * batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv) { struct batadv_gw_node *gw_node, *curr_gw = NULL; u32 max_bw = 0, bw; rcu_read_lock(); hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) { if (!kref_get_unless_zero(&gw_node->refcount)) continue; if (batadv_v_gw_throughput_get(gw_node, &bw) < 0) goto next; if (curr_gw && bw <= max_bw) goto next; batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); max_bw = bw; next: batadv_gw_node_put(gw_node); } rcu_read_unlock(); return curr_gw; } /** * batadv_v_gw_is_eligible() - check if a originator would be selected as GW * @bat_priv: the bat priv with all the soft interface information * @curr_gw_orig: originator representing the currently selected GW * @orig_node: the originator representing the new candidate * * Return: true if orig_node can be selected as current GW, false otherwise */ static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv, struct batadv_orig_node *curr_gw_orig, struct batadv_orig_node *orig_node) { struct batadv_gw_node *curr_gw, *orig_gw = NULL; u32 gw_throughput, orig_throughput, threshold; bool ret = false; threshold = atomic_read(&bat_priv->gw.sel_class); curr_gw = batadv_gw_node_get(bat_priv, curr_gw_orig); if (!curr_gw) { ret = true; goto out; } if (batadv_v_gw_throughput_get(curr_gw, &gw_throughput) < 0) { ret = true; goto out; } orig_gw = batadv_gw_node_get(bat_priv, orig_node); if (!orig_gw) goto out; if (batadv_v_gw_throughput_get(orig_gw, &orig_throughput) < 0) goto out; if (orig_throughput < gw_throughput) goto out; if ((orig_throughput - gw_throughput) < threshold) goto out; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Restarting gateway selection: better gateway found (throughput curr: %u, throughput new: %u)\n", gw_throughput, orig_throughput); ret = true; out: batadv_gw_node_put(curr_gw); batadv_gw_node_put(orig_gw); return ret; } /** * batadv_v_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @gw_node: Gateway to be dumped * * Return: Error code, or 0 on success */ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_gw_node *gw_node) { struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *router; struct batadv_gw_node *curr_gw = NULL; int ret = 0; void *hdr; router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); if (!router) goto out; router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) goto out; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); if (!hdr) { ret = -ENOBUFS; goto out; } genl_dump_check_consistent(cb, hdr); ret = -EMSGSIZE; if (curr_gw == gw_node) { if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) { genlmsg_cancel(msg, hdr); goto out; } } if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, gw_node->orig_node->orig)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, router_ifinfo->bat_v.throughput)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN, router->addr)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, router->if_incoming->net_dev->name)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, router->if_incoming->net_dev->ifindex)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, gw_node->bandwidth_down)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, gw_node->bandwidth_up)) { genlmsg_cancel(msg, hdr); goto out; } genlmsg_end(msg, hdr); ret = 0; out: batadv_gw_node_put(curr_gw); batadv_neigh_ifinfo_put(router_ifinfo); batadv_neigh_node_put(router); return ret; } /** * batadv_v_gw_dump() - Dump gateways into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information */ static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv) { int portid = NETLINK_CB(cb->skb).portid; struct batadv_gw_node *gw_node; int idx_skip = cb->args[0]; int idx = 0; spin_lock_bh(&bat_priv->gw.list_lock); cb->seq = bat_priv->gw.generation << 1 | 1; hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) { if (idx++ < idx_skip) continue; if (batadv_v_gw_dump_entry(msg, portid, cb, bat_priv, gw_node)) { idx_skip = idx - 1; goto unlock; } } idx_skip = idx; unlock: spin_unlock_bh(&bat_priv->gw.list_lock); cb->args[0] = idx_skip; } static struct batadv_algo_ops batadv_batman_v __read_mostly = { .name = "BATMAN_V", .iface = { .activate = batadv_v_iface_activate, .enable = batadv_v_iface_enable, .disable = batadv_v_iface_disable, .update_mac = batadv_v_iface_update_mac, .primary_set = batadv_v_primary_iface_set, }, .neigh = { .hardif_init = batadv_v_hardif_neigh_init, .cmp = batadv_v_neigh_cmp, .is_similar_or_better = batadv_v_neigh_is_sob, .dump = batadv_v_neigh_dump, }, .orig = { .dump = batadv_v_orig_dump, }, .gw = { .init_sel_class = batadv_v_init_sel_class, .sel_class_max = U32_MAX, .get_best_gw_node = batadv_v_gw_get_best_gw_node, .is_eligible = batadv_v_gw_is_eligible, .dump = batadv_v_gw_dump, }, }; /** * batadv_v_hardif_init() - initialize the algorithm specific fields in the * hard-interface object * @hard_iface: the hard-interface to initialize */ void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface) { /* enable link throughput auto-detection by setting the throughput * override to zero */ atomic_set(&hard_iface->bat_v.throughput_override, 0); atomic_set(&hard_iface->bat_v.elp_interval, 500); hard_iface->bat_v.aggr_len = 0; skb_queue_head_init(&hard_iface->bat_v.aggr_list); INIT_DELAYED_WORK(&hard_iface->bat_v.aggr_wq, batadv_v_ogm_aggr_work); } /** * batadv_v_mesh_init() - initialize the B.A.T.M.A.N. V private resources for a * mesh * @bat_priv: the object representing the mesh interface to initialise * * Return: 0 on success or a negative error code otherwise */ int batadv_v_mesh_init(struct batadv_priv *bat_priv) { int ret = 0; ret = batadv_v_ogm_init(bat_priv); if (ret < 0) return ret; return 0; } /** * batadv_v_mesh_free() - free the B.A.T.M.A.N. V private resources for a mesh * @bat_priv: the object representing the mesh interface to free */ void batadv_v_mesh_free(struct batadv_priv *bat_priv) { batadv_v_ogm_free(bat_priv); } /** * batadv_v_init() - B.A.T.M.A.N. V initialization function * * Description: Takes care of initializing all the subcomponents. * It is invoked upon module load only. * * Return: 0 on success or a negative error code otherwise */ int __init batadv_v_init(void) { int ret; /* B.A.T.M.A.N. V echo location protocol packet */ ret = batadv_recv_handler_register(BATADV_ELP, batadv_v_elp_packet_recv); if (ret < 0) return ret; ret = batadv_recv_handler_register(BATADV_OGM2, batadv_v_ogm_packet_recv); if (ret < 0) goto elp_unregister; ret = batadv_algo_register(&batadv_batman_v); if (ret < 0) goto ogm_unregister; return ret; ogm_unregister: batadv_recv_handler_unregister(BATADV_OGM2); elp_unregister: batadv_recv_handler_unregister(BATADV_ELP); return ret; } |
2208 1987 1434 2611 2617 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 | /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _LINUX_RCUREF_H #define _LINUX_RCUREF_H #include <linux/atomic.h> #include <linux/bug.h> #include <linux/limits.h> #include <linux/lockdep.h> #include <linux/preempt.h> #include <linux/rcupdate.h> #define RCUREF_ONEREF 0x00000000U #define RCUREF_MAXREF 0x7FFFFFFFU #define RCUREF_SATURATED 0xA0000000U #define RCUREF_RELEASED 0xC0000000U #define RCUREF_DEAD 0xE0000000U #define RCUREF_NOREF 0xFFFFFFFFU /** * rcuref_init - Initialize a rcuref reference count with the given reference count * @ref: Pointer to the reference count * @cnt: The initial reference count typically '1' */ static inline void rcuref_init(rcuref_t *ref, unsigned int cnt) { atomic_set(&ref->refcnt, cnt - 1); } /** * rcuref_read - Read the number of held reference counts of a rcuref * @ref: Pointer to the reference count * * Return: The number of held references (0 ... N) */ static inline unsigned int rcuref_read(rcuref_t *ref) { unsigned int c = atomic_read(&ref->refcnt); /* Return 0 if within the DEAD zone. */ return c >= RCUREF_RELEASED ? 0 : c + 1; } extern __must_check bool rcuref_get_slowpath(rcuref_t *ref); /** * rcuref_get - Acquire one reference on a rcuref reference count * @ref: Pointer to the reference count * * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See documentation in lib/rcuref.c * * Return: * False if the attempt to acquire a reference failed. This happens * when the last reference has been put already * * True if a reference was successfully acquired */ static inline __must_check bool rcuref_get(rcuref_t *ref) { /* * Unconditionally increase the reference count. The saturation and * dead zones provide enough tolerance for this. */ if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt))) return true; /* Handle the cases inside the saturation and dead zones */ return rcuref_get_slowpath(ref); } extern __must_check bool rcuref_put_slowpath(rcuref_t *ref); /* * Internal helper. Do not invoke directly. */ static __always_inline __must_check bool __rcuref_put(rcuref_t *ref) { RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(), "suspicious rcuref_put_rcusafe() usage"); /* * Unconditionally decrease the reference count. The saturation and * dead zones provide enough tolerance for this. */ if (likely(!atomic_add_negative_release(-1, &ref->refcnt))) return false; /* * Handle the last reference drop and cases inside the saturation * and dead zones. */ return rcuref_put_slowpath(ref); } /** * rcuref_put_rcusafe -- Release one reference for a rcuref reference count RCU safe * @ref: Pointer to the reference count * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Can be invoked from contexts, which guarantee that no grace period can * happen which would free the object concurrently if the decrement drops * the last reference and the slowpath races against a concurrent get() and * put() pair. rcu_read_lock()'ed and atomic contexts qualify. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely release the * object which is protected by the reference counter. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * release the protected object. */ static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref) { return __rcuref_put(ref); } /** * rcuref_put -- Release one reference for a rcuref reference count * @ref: Pointer to the reference count * * Can be invoked from any context. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Return: * * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ static inline __must_check bool rcuref_put(rcuref_t *ref) { bool released; preempt_disable(); released = __rcuref_put(ref); preempt_enable(); return released; } #endif |
3 3 3258 788 2552 430 2959 3257 3260 3258 2 2 2 2 3257 297 296 40 40 40 40 40 40 12 12 12 9 723 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 | // SPDX-License-Identifier: GPL-2.0 /* * Tag allocation using scalable bitmaps. Uses active queue tracking to support * fairer distribution of tags between multiple submitters when a shared tag map * is used. * * Copyright (C) 2013-2014 Jens Axboe */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/delay.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" /* * Recalculate wakeup batch when tag is shared by hctx. */ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, unsigned int users) { if (!users) return; sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags, users); sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags, users); } /* * If a previously inactive queue goes active, bump the active user count. * We need to do this before try to allocate driver tag, then even if fail * to get tag when first time, the other shared-tag users could reserve * budget for it. */ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { unsigned int users; struct blk_mq_tags *tags = hctx->tags; /* * calling test_bit() prior to test_and_set_bit() is intentional, * it avoids dirtying the cacheline if the queue is already active. */ if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) || test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return; } else { if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) || test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; } spin_lock_irq(&tags->lock); users = tags->active_queues + 1; WRITE_ONCE(tags->active_queues, users); blk_mq_update_wake_batch(tags, users); spin_unlock_irq(&tags->lock); } /* * Wakeup all potentially sleeping on tags */ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { sbitmap_queue_wake_all(&tags->bitmap_tags); if (include_reserve) sbitmap_queue_wake_all(&tags->breserved_tags); } /* * If a previously busy queue goes inactive, potential waiters could now * be allowed to queue. Wake them up and check. */ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->tags; unsigned int users; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return; } else { if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; } spin_lock_irq(&tags->lock); users = tags->active_queues - 1; WRITE_ONCE(tags->active_queues, users); blk_mq_update_wake_batch(tags, users); spin_unlock_irq(&tags->lock); blk_mq_tag_wakeup_all(tags, false); } static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt) { if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) && !hctx_may_queue(data->hctx, bt)) return BLK_MQ_NO_TAG; if (data->shallow_depth) return sbitmap_queue_get_shallow(bt, data->shallow_depth); else return __sbitmap_queue_get(bt); } unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, unsigned int *offset) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct sbitmap_queue *bt = &tags->bitmap_tags; unsigned long ret; if (data->shallow_depth ||data->flags & BLK_MQ_REQ_RESERVED || data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) return 0; ret = __sbitmap_queue_get_batch(bt, nr_tags, offset); *offset += tags->nr_reserved_tags; return ret; } unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct sbitmap_queue *bt; struct sbq_wait_state *ws; DEFINE_SBQ_WAIT(wait); unsigned int tag_offset; int tag; if (data->flags & BLK_MQ_REQ_RESERVED) { if (unlikely(!tags->nr_reserved_tags)) { WARN_ON_ONCE(1); return BLK_MQ_NO_TAG; } bt = &tags->breserved_tags; tag_offset = 0; } else { bt = &tags->bitmap_tags; tag_offset = tags->nr_reserved_tags; } tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) goto found_tag; if (data->flags & BLK_MQ_REQ_NOWAIT) return BLK_MQ_NO_TAG; ws = bt_wait_ptr(bt, data->hctx); do { struct sbitmap_queue *bt_prev; /* * We're out of tags on this hardware queue, kick any * pending IO submits before going to sleep waiting for * some to complete. */ blk_mq_run_hw_queue(data->hctx, false); /* * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. */ tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) break; sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE); tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) break; bt_prev = bt; io_schedule(); sbitmap_finish_wait(bt, ws, &wait); data->ctx = blk_mq_get_ctx(data->q); data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, data->ctx); tags = blk_mq_tags_from_data(data); if (data->flags & BLK_MQ_REQ_RESERVED) bt = &tags->breserved_tags; else bt = &tags->bitmap_tags; /* * If destination hw queue is changed, fake wake up on * previous queue for compensating the wake up miss, so * other allocations on previous queue won't be starved. */ if (bt != bt_prev) sbitmap_queue_wake_up(bt_prev, 1); ws = bt_wait_ptr(bt, data->hctx); } while (1); sbitmap_finish_wait(bt, ws, &wait); found_tag: /* * Give up this allocation if the hctx is inactive. The caller will * retry on an active hctx. */ if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) { blk_mq_put_tag(tags, data->ctx, tag + tag_offset); return BLK_MQ_NO_TAG; } return tag + tag_offset; } void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag) { if (!blk_mq_tag_is_reserved(tags, tag)) { const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); } else { sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); } } void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags) { sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags, tag_array, nr_tags); } struct bt_iter_data { struct blk_mq_hw_ctx *hctx; struct request_queue *q; busy_tag_iter_fn *fn; void *data; bool reserved; }; static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, unsigned int bitnr) { struct request *rq; unsigned long flags; spin_lock_irqsave(&tags->lock, flags); rq = tags->rqs[bitnr]; if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq)) rq = NULL; spin_unlock_irqrestore(&tags->lock, flags); return rq; } static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_iter_data *iter_data = data; struct blk_mq_hw_ctx *hctx = iter_data->hctx; struct request_queue *q = iter_data->q; struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_tags *tags; struct request *rq; bool ret = true; if (blk_mq_is_shared_tags(set->flags)) tags = set->shared_tags; else tags = hctx->tags; if (!iter_data->reserved) bitnr += tags->nr_reserved_tags; /* * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ rq = blk_mq_find_and_get_req(tags, bitnr); if (!rq) return true; if (rq->q == q && (!hctx || rq->mq_hctx == hctx)) ret = iter_data->fn(rq, iter_data->data); blk_mq_put_rq_ref(rq); return ret; } /** * bt_for_each - iterate over the requests associated with a hardware queue * @hctx: Hardware queue to examine. * @q: Request queue to examine. * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each request * associated with @hctx that has been assigned a driver tag. * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved) * where rq is a pointer to a request. Return true to continue * iterating tags, false to stop. * @data: Will be passed as third argument to @fn. * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. */ static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q, struct sbitmap_queue *bt, busy_tag_iter_fn *fn, void *data, bool reserved) { struct bt_iter_data iter_data = { .hctx = hctx, .fn = fn, .data = data, .reserved = reserved, .q = q, }; sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); } struct bt_tags_iter_data { struct blk_mq_tags *tags; busy_tag_iter_fn *fn; void *data; unsigned int flags; }; #define BT_TAG_ITER_RESERVED (1 << 0) #define BT_TAG_ITER_STARTED (1 << 1) #define BT_TAG_ITER_STATIC_RQS (1 << 2) static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_tags_iter_data *iter_data = data; struct blk_mq_tags *tags = iter_data->tags; struct request *rq; bool ret = true; bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS); if (!(iter_data->flags & BT_TAG_ITER_RESERVED)) bitnr += tags->nr_reserved_tags; /* * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ if (iter_static_rqs) rq = tags->static_rqs[bitnr]; else rq = blk_mq_find_and_get_req(tags, bitnr); if (!rq) return true; if (!(iter_data->flags & BT_TAG_ITER_STARTED) || blk_mq_request_started(rq)) ret = iter_data->fn(rq, iter_data->data); if (!iter_static_rqs) blk_mq_put_rq_ref(rq); return ret; } /** * bt_tags_for_each - iterate over the requests in a tag map * @tags: Tag map to iterate over. * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each started * request. @fn will be called as follows: @fn(rq, @data, * @reserved) where rq is a pointer to a request. Return true * to continue iterating tags, false to stop. * @data: Will be passed as second argument to @fn. * @flags: BT_TAG_ITER_* */ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, busy_tag_iter_fn *fn, void *data, unsigned int flags) { struct bt_tags_iter_data iter_data = { .tags = tags, .fn = fn, .data = data, .flags = flags, }; if (tags->rqs) sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data); } static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv, unsigned int flags) { WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED); if (tags->nr_reserved_tags) bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, flags | BT_TAG_ITER_RESERVED); bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags); } /** * blk_mq_all_tag_iter - iterate over all requests in a tag map * @tags: Tag map to iterate over. * @fn: Pointer to the function that will be called for each * request. @fn will be called as follows: @fn(rq, @priv, * reserved) where rq is a pointer to a request. 'reserved' * indicates whether or not @rq is a reserved request. Return * true to continue iterating tags, false to stop. * @priv: Will be passed as second argument to @fn. * * Caller has to pass the tag map from which requests are allocated. */ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv) { __blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS); } /** * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set * @tagset: Tag set to iterate over. * @fn: Pointer to the function that will be called for each started * request. @fn will be called as follows: @fn(rq, @priv, * reserved) where rq is a pointer to a request. 'reserved' * indicates whether or not @rq is a reserved request. Return * true to continue iterating tags, false to stop. * @priv: Will be passed as second argument to @fn. * * We grab one request reference before calling @fn and release it after * @fn returns. */ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) { unsigned int flags = tagset->flags; int i, nr_tags; nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues; for (i = 0; i < nr_tags; i++) { if (tagset->tags && tagset->tags[i]) __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, BT_TAG_ITER_STARTED); } } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); static bool blk_mq_tagset_count_completed_rqs(struct request *rq, void *data) { unsigned *count = data; if (blk_mq_request_completed(rq)) (*count)++; return true; } /** * blk_mq_tagset_wait_completed_request - Wait until all scheduled request * completions have finished. * @tagset: Tag set to drain completed request * * Note: This function has to be run after all IO queues are shutdown */ void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset) { while (true) { unsigned count = 0; blk_mq_tagset_busy_iter(tagset, blk_mq_tagset_count_completed_rqs, &count); if (!count) break; msleep(5); } } EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); /** * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag * @q: Request queue to examine. * @fn: Pointer to the function that will be called for each request * on @q. @fn will be called as follows: @fn(hctx, rq, @priv, * reserved) where rq is a pointer to a request and hctx points * to the hardware queue associated with the request. 'reserved' * indicates whether or not @rq is a reserved request. * @priv: Will be passed as third argument to @fn. * * Note: if @q->tag_set is shared with other request queues then @fn will be * called for all requests on all queues that share that tag set and not only * for requests associated with @q. */ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { /* * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table * while the queue is frozen. So we can use q_usage_counter to avoid * racing with it. */ if (!percpu_ref_tryget(&q->q_usage_counter)) return; if (blk_mq_is_shared_tags(q->tag_set->flags)) { struct blk_mq_tags *tags = q->tag_set->shared_tags; struct sbitmap_queue *bresv = &tags->breserved_tags; struct sbitmap_queue *btags = &tags->bitmap_tags; if (tags->nr_reserved_tags) bt_for_each(NULL, q, bresv, fn, priv, true); bt_for_each(NULL, q, btags, fn, priv, false); } else { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_tags *tags = hctx->tags; struct sbitmap_queue *bresv = &tags->breserved_tags; struct sbitmap_queue *btags = &tags->bitmap_tags; /* * If no software queues are currently mapped to this * hardware queue, there's nothing to check */ if (!blk_mq_hw_queue_mapped(hctx)) continue; if (tags->nr_reserved_tags) bt_for_each(hctx, q, bresv, fn, priv, true); bt_for_each(hctx, q, btags, fn, priv, false); } } blk_queue_exit(q); } static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, bool round_robin, int node) { return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL, node); } int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, struct sbitmap_queue *breserved_tags, unsigned int queue_depth, unsigned int reserved, int node, int alloc_policy) { unsigned int depth = queue_depth - reserved; bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; if (bt_alloc(bitmap_tags, depth, round_robin, node)) return -ENOMEM; if (bt_alloc(breserved_tags, reserved, round_robin, node)) goto free_bitmap_tags; return 0; free_bitmap_tags: sbitmap_queue_free(bitmap_tags); return -ENOMEM; } struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, int node, int alloc_policy) { struct blk_mq_tags *tags; if (total_tags > BLK_MQ_TAG_MAX) { pr_err("blk-mq: tag depth too large\n"); return NULL; } tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node); if (!tags) return NULL; tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock); if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags, total_tags, reserved_tags, node, alloc_policy) < 0) { kfree(tags); return NULL; } return tags; } void blk_mq_free_tags(struct blk_mq_tags *tags) { sbitmap_queue_free(&tags->bitmap_tags); sbitmap_queue_free(&tags->breserved_tags); kfree(tags); } int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tagsptr, unsigned int tdepth, bool can_grow) { struct blk_mq_tags *tags = *tagsptr; if (tdepth <= tags->nr_reserved_tags) return -EINVAL; /* * If we are allowed to grow beyond the original size, allocate * a new set of tags before freeing the old one. */ if (tdepth > tags->nr_tags) { struct blk_mq_tag_set *set = hctx->queue->tag_set; struct blk_mq_tags *new; if (!can_grow) return -EINVAL; /* * We need some sort of upper limit, set it high enough that * no valid use cases should require more. */ if (tdepth > MAX_SCHED_RQ) return -EINVAL; /* * Only the sbitmap needs resizing since we allocated the max * initially. */ if (blk_mq_is_shared_tags(set->flags)) return 0; new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth); if (!new) return -ENOMEM; blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num); *tagsptr = new; } else { /* * Don't need (or can't) update reserved tags here, they * remain static and should never need resizing. */ sbitmap_queue_resize(&tags->bitmap_tags, tdepth - tags->nr_reserved_tags); } return 0; } void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size) { struct blk_mq_tags *tags = set->shared_tags; sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags); } void blk_mq_tag_update_sched_shared_tags(struct request_queue *q) { sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags, q->nr_requests - q->tag_set->reserved_tags); } /** * blk_mq_unique_tag() - return a tag that is unique queue-wide * @rq: request for which to compute a unique tag * * The tag field in struct request is unique per hardware queue but not over * all hardware queues. Hence this function that returns a tag with the * hardware context index in the upper bits and the per hardware queue tag in * the lower bits. * * Note: When called for a request that is queued on a non-multiqueue request * queue, the hardware context index is set to zero. */ u32 blk_mq_unique_tag(struct request *rq) { return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) | (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); } EXPORT_SYMBOL(blk_mq_unique_tag); |
72 72 72 86 77 11 8 86 83 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 | // SPDX-License-Identifier: MIT /* * Copyright (C) 2019 Google, Inc. * * Authors: * Sean Paul <seanpaul@chromium.org> */ #include <linux/average.h> #include <linux/bitops.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <drm/drm_atomic.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_connector.h> #include <drm/drm_crtc.h> #include <drm/drm_device.h> #include <drm/drm_mode_config.h> #include <drm/drm_modeset_lock.h> #include <drm/drm_print.h> #include <drm/drm_self_refresh_helper.h> /** * DOC: overview * * This helper library provides an easy way for drivers to leverage the atomic * framework to implement panel self refresh (SR) support. Drivers are * responsible for initializing and cleaning up the SR helpers on load/unload * (see &drm_self_refresh_helper_init/&drm_self_refresh_helper_cleanup). * The connector is responsible for setting * &drm_connector_state.self_refresh_aware to true at runtime if it is SR-aware * (meaning it knows how to initiate self refresh on the panel). * * Once a crtc has enabled SR using &drm_self_refresh_helper_init, the * helpers will monitor activity and call back into the driver to enable/disable * SR as appropriate. The best way to think about this is that it's a DPMS * on/off request with &drm_crtc_state.self_refresh_active set in crtc state * that tells you to disable/enable SR on the panel instead of power-cycling it. * * During SR, drivers may choose to fully disable their crtc/encoder/bridge * hardware (in which case no driver changes are necessary), or they can inspect * &drm_crtc_state.self_refresh_active if they want to enter low power mode * without full disable (in case full disable/enable is too slow). * * SR will be deactivated if there are any atomic updates affecting the * pipe that is in SR mode. If a crtc is driving multiple connectors, all * connectors must be SR aware and all will enter/exit SR mode at the same time. * * If the crtc and connector are SR aware, but the panel connected does not * support it (or is otherwise unable to enter SR), the driver should fail * atomic_check when &drm_crtc_state.self_refresh_active is true. */ #define SELF_REFRESH_AVG_SEED_MS 200 DECLARE_EWMA(psr_time, 4, 4) struct drm_self_refresh_data { struct drm_crtc *crtc; struct delayed_work entry_work; struct mutex avg_mutex; struct ewma_psr_time entry_avg_ms; struct ewma_psr_time exit_avg_ms; }; static void drm_self_refresh_helper_entry_work(struct work_struct *work) { struct drm_self_refresh_data *sr_data = container_of( to_delayed_work(work), struct drm_self_refresh_data, entry_work); struct drm_crtc *crtc = sr_data->crtc; struct drm_device *dev = crtc->dev; struct drm_modeset_acquire_ctx ctx; struct drm_atomic_state *state; struct drm_connector *conn; struct drm_connector_state *conn_state; struct drm_crtc_state *crtc_state; int i, ret = 0; drm_modeset_acquire_init(&ctx, 0); state = drm_atomic_state_alloc(dev); if (!state) { ret = -ENOMEM; goto out_drop_locks; } retry: state->acquire_ctx = &ctx; crtc_state = drm_atomic_get_crtc_state(state, crtc); if (IS_ERR(crtc_state)) { ret = PTR_ERR(crtc_state); goto out; } if (!crtc_state->enable) goto out; ret = drm_atomic_add_affected_connectors(state, crtc); if (ret) goto out; for_each_new_connector_in_state(state, conn, conn_state, i) { if (!conn_state->self_refresh_aware) goto out; } crtc_state->active = false; crtc_state->self_refresh_active = true; ret = drm_atomic_commit(state); if (ret) goto out; out: if (ret == -EDEADLK) { drm_atomic_state_clear(state); ret = drm_modeset_backoff(&ctx); if (!ret) goto retry; } drm_atomic_state_put(state); out_drop_locks: drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); } /** * drm_self_refresh_helper_update_avg_times - Updates a crtc's SR time averages * @state: the state which has just been applied to hardware * @commit_time_ms: the amount of time in ms that this commit took to complete * @new_self_refresh_mask: bitmask of crtc's that have self_refresh_active in * new state * * Called after &drm_mode_config_funcs.atomic_commit_tail, this function will * update the average entry/exit self refresh times on self refresh transitions. * These averages will be used when calculating how long to delay before * entering self refresh mode after activity. */ void drm_self_refresh_helper_update_avg_times(struct drm_atomic_state *state, unsigned int commit_time_ms, unsigned int new_self_refresh_mask) { struct drm_crtc *crtc; struct drm_crtc_state *old_crtc_state; int i; for_each_old_crtc_in_state(state, crtc, old_crtc_state, i) { bool new_self_refresh_active = new_self_refresh_mask & BIT(i); struct drm_self_refresh_data *sr_data = crtc->self_refresh_data; struct ewma_psr_time *time; if (old_crtc_state->self_refresh_active == new_self_refresh_active) continue; if (new_self_refresh_active) time = &sr_data->entry_avg_ms; else time = &sr_data->exit_avg_ms; mutex_lock(&sr_data->avg_mutex); ewma_psr_time_add(time, commit_time_ms); mutex_unlock(&sr_data->avg_mutex); } } EXPORT_SYMBOL(drm_self_refresh_helper_update_avg_times); /** * drm_self_refresh_helper_alter_state - Alters the atomic state for SR exit * @state: the state currently being checked * * Called at the end of atomic check. This function checks the state for flags * incompatible with self refresh exit and changes them. This is a bit * disingenuous since userspace is expecting one thing and we're giving it * another. However in order to keep self refresh entirely hidden from * userspace, this is required. * * At the end, we queue up the self refresh entry work so we can enter PSR after * the desired delay. */ void drm_self_refresh_helper_alter_state(struct drm_atomic_state *state) { struct drm_crtc *crtc; struct drm_crtc_state *crtc_state; int i; if (state->async_update || !state->allow_modeset) { for_each_old_crtc_in_state(state, crtc, crtc_state, i) { if (crtc_state->self_refresh_active) { state->async_update = false; state->allow_modeset = true; break; } } } for_each_new_crtc_in_state(state, crtc, crtc_state, i) { struct drm_self_refresh_data *sr_data; unsigned int delay; /* Don't trigger the entry timer when we're already in SR */ if (crtc_state->self_refresh_active) continue; sr_data = crtc->self_refresh_data; if (!sr_data) continue; mutex_lock(&sr_data->avg_mutex); delay = (ewma_psr_time_read(&sr_data->entry_avg_ms) + ewma_psr_time_read(&sr_data->exit_avg_ms)) * 2; mutex_unlock(&sr_data->avg_mutex); mod_delayed_work(system_wq, &sr_data->entry_work, msecs_to_jiffies(delay)); } } EXPORT_SYMBOL(drm_self_refresh_helper_alter_state); /** * drm_self_refresh_helper_init - Initializes self refresh helpers for a crtc * @crtc: the crtc which supports self refresh supported displays * * Returns zero if successful or -errno on failure */ int drm_self_refresh_helper_init(struct drm_crtc *crtc) { struct drm_self_refresh_data *sr_data = crtc->self_refresh_data; /* Helper is already initialized */ if (WARN_ON(sr_data)) return -EINVAL; sr_data = kzalloc(sizeof(*sr_data), GFP_KERNEL); if (!sr_data) return -ENOMEM; INIT_DELAYED_WORK(&sr_data->entry_work, drm_self_refresh_helper_entry_work); sr_data->crtc = crtc; mutex_init(&sr_data->avg_mutex); ewma_psr_time_init(&sr_data->entry_avg_ms); ewma_psr_time_init(&sr_data->exit_avg_ms); /* * Seed the averages so they're non-zero (and sufficiently large * for even poorly performing panels). As time goes on, this will be * averaged out and the values will trend to their true value. */ ewma_psr_time_add(&sr_data->entry_avg_ms, SELF_REFRESH_AVG_SEED_MS); ewma_psr_time_add(&sr_data->exit_avg_ms, SELF_REFRESH_AVG_SEED_MS); crtc->self_refresh_data = sr_data; return 0; } EXPORT_SYMBOL(drm_self_refresh_helper_init); /** * drm_self_refresh_helper_cleanup - Cleans up self refresh helpers for a crtc * @crtc: the crtc to cleanup */ void drm_self_refresh_helper_cleanup(struct drm_crtc *crtc) { struct drm_self_refresh_data *sr_data = crtc->self_refresh_data; /* Helper is already uninitialized */ if (!sr_data) return; crtc->self_refresh_data = NULL; cancel_delayed_work_sync(&sr_data->entry_work); kfree(sr_data); } EXPORT_SYMBOL(drm_self_refresh_helper_cleanup); |
7 2 2 2 2 2 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) * Copyright (C) Tomi Manninen OH2BNS (oh2bns@sral.fi) * Copyright (C) Darryl Miles G7LED (dlm@g7led.demon.co.uk) * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr) * Copyright (C) 2002 Ralf Baechle DO1GRB (ralf@gnu.org) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> static void ax25_heartbeat_expiry(struct timer_list *); static void ax25_t1timer_expiry(struct timer_list *); static void ax25_t2timer_expiry(struct timer_list *); static void ax25_t3timer_expiry(struct timer_list *); static void ax25_idletimer_expiry(struct timer_list *); void ax25_setup_timers(ax25_cb *ax25) { timer_setup(&ax25->timer, ax25_heartbeat_expiry, 0); timer_setup(&ax25->t1timer, ax25_t1timer_expiry, 0); timer_setup(&ax25->t2timer, ax25_t2timer_expiry, 0); timer_setup(&ax25->t3timer, ax25_t3timer_expiry, 0); timer_setup(&ax25->idletimer, ax25_idletimer_expiry, 0); } void ax25_start_heartbeat(ax25_cb *ax25) { mod_timer(&ax25->timer, jiffies + 5 * HZ); } void ax25_start_t1timer(ax25_cb *ax25) { mod_timer(&ax25->t1timer, jiffies + ax25->t1); } void ax25_start_t2timer(ax25_cb *ax25) { mod_timer(&ax25->t2timer, jiffies + ax25->t2); } void ax25_start_t3timer(ax25_cb *ax25) { if (ax25->t3 > 0) mod_timer(&ax25->t3timer, jiffies + ax25->t3); else del_timer(&ax25->t3timer); } void ax25_start_idletimer(ax25_cb *ax25) { if (ax25->idle > 0) mod_timer(&ax25->idletimer, jiffies + ax25->idle); else del_timer(&ax25->idletimer); } void ax25_stop_heartbeat(ax25_cb *ax25) { del_timer(&ax25->timer); } void ax25_stop_t1timer(ax25_cb *ax25) { del_timer(&ax25->t1timer); } void ax25_stop_t2timer(ax25_cb *ax25) { del_timer(&ax25->t2timer); } void ax25_stop_t3timer(ax25_cb *ax25) { del_timer(&ax25->t3timer); } void ax25_stop_idletimer(ax25_cb *ax25) { del_timer(&ax25->idletimer); } int ax25_t1timer_running(ax25_cb *ax25) { return timer_pending(&ax25->t1timer); } unsigned long ax25_display_timer(struct timer_list *timer) { long delta = timer->expires - jiffies; if (!timer_pending(timer)) return 0; return max(0L, delta); } EXPORT_SYMBOL(ax25_display_timer); static void ax25_heartbeat_expiry(struct timer_list *t) { int proto = AX25_PROTO_STD_SIMPLEX; ax25_cb *ax25 = from_timer(ax25, t, timer); if (ax25->ax25_dev) proto = ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]; switch (proto) { case AX25_PROTO_STD_SIMPLEX: case AX25_PROTO_STD_DUPLEX: ax25_std_heartbeat_expiry(ax25); break; #ifdef CONFIG_AX25_DAMA_SLAVE case AX25_PROTO_DAMA_SLAVE: if (ax25->ax25_dev->dama.slave) ax25_ds_heartbeat_expiry(ax25); else ax25_std_heartbeat_expiry(ax25); break; #endif } } static void ax25_t1timer_expiry(struct timer_list *t) { ax25_cb *ax25 = from_timer(ax25, t, t1timer); switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { case AX25_PROTO_STD_SIMPLEX: case AX25_PROTO_STD_DUPLEX: ax25_std_t1timer_expiry(ax25); break; #ifdef CONFIG_AX25_DAMA_SLAVE case AX25_PROTO_DAMA_SLAVE: if (!ax25->ax25_dev->dama.slave) ax25_std_t1timer_expiry(ax25); break; #endif } } static void ax25_t2timer_expiry(struct timer_list *t) { ax25_cb *ax25 = from_timer(ax25, t, t2timer); switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { case AX25_PROTO_STD_SIMPLEX: case AX25_PROTO_STD_DUPLEX: ax25_std_t2timer_expiry(ax25); break; #ifdef CONFIG_AX25_DAMA_SLAVE case AX25_PROTO_DAMA_SLAVE: if (!ax25->ax25_dev->dama.slave) ax25_std_t2timer_expiry(ax25); break; #endif } } static void ax25_t3timer_expiry(struct timer_list *t) { ax25_cb *ax25 = from_timer(ax25, t, t3timer); switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { case AX25_PROTO_STD_SIMPLEX: case AX25_PROTO_STD_DUPLEX: ax25_std_t3timer_expiry(ax25); break; #ifdef CONFIG_AX25_DAMA_SLAVE case AX25_PROTO_DAMA_SLAVE: if (ax25->ax25_dev->dama.slave) ax25_ds_t3timer_expiry(ax25); else ax25_std_t3timer_expiry(ax25); break; #endif } } static void ax25_idletimer_expiry(struct timer_list *t) { ax25_cb *ax25 = from_timer(ax25, t, idletimer); switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { case AX25_PROTO_STD_SIMPLEX: case AX25_PROTO_STD_DUPLEX: ax25_std_idletimer_expiry(ax25); break; #ifdef CONFIG_AX25_DAMA_SLAVE case AX25_PROTO_DAMA_SLAVE: if (ax25->ax25_dev->dama.slave) ax25_ds_idletimer_expiry(ax25); else ax25_std_idletimer_expiry(ax25); break; #endif } } |
45 2 13 35 107 251 20 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BITOPS_H #define _LINUX_BITOPS_H #include <asm/types.h> #include <linux/bits.h> #include <linux/typecheck.h> #include <uapi/linux/kernel.h> /* Set bits in the first 'n' bytes when loaded from memory */ #ifdef __LITTLE_ENDIAN # define aligned_byte_mask(n) ((1UL << 8*(n))-1) #else # define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n))) #endif #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) #define BITS_TO_LONGS(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) #define BITS_TO_U64(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u64)) #define BITS_TO_U32(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) #define BITS_TO_BYTES(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); extern unsigned int __sw_hweight32(unsigned int w); extern unsigned long __sw_hweight64(__u64 w); /* * Defined here because those may be needed by architecture-specific static * inlines. */ #include <asm-generic/bitops/generic-non-atomic.h> /* * Many architecture-specific non-atomic bitops contain inline asm code and due * to that the compiler can't optimize them to compile-time expressions or * constants. In contrary, generic_*() helpers are defined in pure C and * compilers optimize them just well. * Therefore, to make `unsigned long foo = 0; __set_bit(BAR, &foo)` effectively * equal to `unsigned long foo = BIT(BAR)`, pick the generic C alternative when * the arguments can be resolved at compile time. That expression itself is a * constant and doesn't bring any functional changes to the rest of cases. * The casts to `uintptr_t` are needed to mitigate `-Waddress` warnings when * passing a bitmap from .bss or .data (-> `!!addr` is always true). */ #define bitop(op, nr, addr) \ ((__builtin_constant_p(nr) && \ __builtin_constant_p((uintptr_t)(addr) != (uintptr_t)NULL) && \ (uintptr_t)(addr) != (uintptr_t)NULL && \ __builtin_constant_p(*(const unsigned long *)(addr))) ? \ const##op(nr, addr) : op(nr, addr)) #define __set_bit(nr, addr) bitop(___set_bit, nr, addr) #define __clear_bit(nr, addr) bitop(___clear_bit, nr, addr) #define __change_bit(nr, addr) bitop(___change_bit, nr, addr) #define __test_and_set_bit(nr, addr) bitop(___test_and_set_bit, nr, addr) #define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr) #define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr) #define test_bit(nr, addr) bitop(_test_bit, nr, addr) #define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr) /* * Include this here because some architectures need generic_ffs/fls in * scope */ #include <asm/bitops.h> /* Check that the bitops prototypes are sane */ #define __check_bitop_pr(name) \ static_assert(__same_type(arch_##name, generic_##name) && \ __same_type(const_##name, generic_##name) && \ __same_type(_##name, generic_##name)) __check_bitop_pr(__set_bit); __check_bitop_pr(__clear_bit); __check_bitop_pr(__change_bit); __check_bitop_pr(__test_and_set_bit); __check_bitop_pr(__test_and_clear_bit); __check_bitop_pr(__test_and_change_bit); __check_bitop_pr(test_bit); #undef __check_bitop_pr static inline int get_bitmask_order(unsigned int count) { int order; order = fls(count); return order; /* We could be slightly more clever with -1 here... */ } static __always_inline unsigned long hweight_long(unsigned long w) { return sizeof(w) == 4 ? hweight32(w) : hweight64((__u64)w); } /** * rol64 - rotate a 64-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u64 rol64(__u64 word, unsigned int shift) { return (word << (shift & 63)) | (word >> ((-shift) & 63)); } /** * ror64 - rotate a 64-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u64 ror64(__u64 word, unsigned int shift) { return (word >> (shift & 63)) | (word << ((-shift) & 63)); } /** * rol32 - rotate a 32-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u32 rol32(__u32 word, unsigned int shift) { return (word << (shift & 31)) | (word >> ((-shift) & 31)); } /** * ror32 - rotate a 32-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u32 ror32(__u32 word, unsigned int shift) { return (word >> (shift & 31)) | (word << ((-shift) & 31)); } /** * rol16 - rotate a 16-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u16 rol16(__u16 word, unsigned int shift) { return (word << (shift & 15)) | (word >> ((-shift) & 15)); } /** * ror16 - rotate a 16-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u16 ror16(__u16 word, unsigned int shift) { return (word >> (shift & 15)) | (word << ((-shift) & 15)); } /** * rol8 - rotate an 8-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u8 rol8(__u8 word, unsigned int shift) { return (word << (shift & 7)) | (word >> ((-shift) & 7)); } /** * ror8 - rotate an 8-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u8 ror8(__u8 word, unsigned int shift) { return (word >> (shift & 7)) | (word << ((-shift) & 7)); } /** * sign_extend32 - sign extend a 32-bit value using specified bit as sign-bit * @value: value to sign extend * @index: 0 based bit index (0<=index<32) to sign bit * * This is safe to use for 16- and 8-bit types as well. */ static __always_inline __s32 sign_extend32(__u32 value, int index) { __u8 shift = 31 - index; return (__s32)(value << shift) >> shift; } /** * sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit * @value: value to sign extend * @index: 0 based bit index (0<=index<64) to sign bit */ static __always_inline __s64 sign_extend64(__u64 value, int index) { __u8 shift = 63 - index; return (__s64)(value << shift) >> shift; } static inline unsigned fls_long(unsigned long l) { if (sizeof(l) == 4) return fls(l); return fls64(l); } static inline int get_count_order(unsigned int count) { if (count == 0) return -1; return fls(--count); } /** * get_count_order_long - get order after rounding @l up to power of 2 * @l: parameter * * it is same as get_count_order() but with long type parameter */ static inline int get_count_order_long(unsigned long l) { if (l == 0UL) return -1; return (int)fls_long(--l); } /** * __ffs64 - find first set bit in a 64 bit word * @word: The 64 bit word * * On 64 bit arches this is a synonym for __ffs * The result is not defined if no bits are set, so check that @word * is non-zero before calling this. */ static inline unsigned long __ffs64(u64 word) { #if BITS_PER_LONG == 32 if (((u32)word) == 0UL) return __ffs((u32)(word >> 32)) + 32; #elif BITS_PER_LONG != 64 #error BITS_PER_LONG not 32 or 64 #endif return __ffs((unsigned long)word); } /** * fns - find N'th set bit in a word * @word: The word to search * @n: Bit to find */ static inline unsigned long fns(unsigned long word, unsigned int n) { unsigned int bit; while (word) { bit = __ffs(word); if (n-- == 0) return bit; __clear_bit(bit, &word); } return BITS_PER_LONG; } /** * assign_bit - Assign value to a bit in memory * @nr: the bit to set * @addr: the address to start counting from * @value: the value to assign */ static __always_inline void assign_bit(long nr, volatile unsigned long *addr, bool value) { if (value) set_bit(nr, addr); else clear_bit(nr, addr); } static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, bool value) { if (value) __set_bit(nr, addr); else __clear_bit(nr, addr); } /** * __ptr_set_bit - Set bit in a pointer's value * @nr: the bit to set * @addr: the address of the pointer variable * * Example: * void *p = foo(); * __ptr_set_bit(bit, &p); */ #define __ptr_set_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ __set_bit(nr, (unsigned long *)(addr)); \ }) /** * __ptr_clear_bit - Clear bit in a pointer's value * @nr: the bit to clear * @addr: the address of the pointer variable * * Example: * void *p = foo(); * __ptr_clear_bit(bit, &p); */ #define __ptr_clear_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ __clear_bit(nr, (unsigned long *)(addr)); \ }) /** * __ptr_test_bit - Test bit in a pointer's value * @nr: the bit to test * @addr: the address of the pointer variable * * Example: * void *p = foo(); * if (__ptr_test_bit(bit, &p)) { * ... * } else { * ... * } */ #define __ptr_test_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ test_bit(nr, (unsigned long *)(addr)); \ }) #ifdef __KERNEL__ #ifndef set_mask_bits #define set_mask_bits(ptr, mask, bits) \ ({ \ const typeof(*(ptr)) mask__ = (mask), bits__ = (bits); \ typeof(*(ptr)) old__, new__; \ \ old__ = READ_ONCE(*(ptr)); \ do { \ new__ = (old__ & ~mask__) | bits__; \ } while (!try_cmpxchg(ptr, &old__, new__)); \ \ old__; \ }) #endif #ifndef bit_clear_unless #define bit_clear_unless(ptr, clear, test) \ ({ \ const typeof(*(ptr)) clear__ = (clear), test__ = (test);\ typeof(*(ptr)) old__, new__; \ \ old__ = READ_ONCE(*(ptr)); \ do { \ if (old__ & test__) \ break; \ new__ = old__ & ~clear__; \ } while (!try_cmpxchg(ptr, &old__, new__)); \ \ !(old__ & test__); \ }) #endif #endif /* __KERNEL__ */ #endif |
14 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BTRFS_ACCESSORS_H #define BTRFS_ACCESSORS_H #include <asm/unaligned.h> #include <linux/stddef.h> #include <linux/types.h> #include <linux/align.h> #include <linux/build_bug.h> #include <linux/compiler.h> #include <linux/string.h> #include <linux/mm.h> #include <uapi/linux/btrfs_tree.h> struct extent_buffer; struct btrfs_map_token { struct extent_buffer *eb; char *kaddr; unsigned long offset; }; void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb); /* * Some macros to generate set/get functions for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple one * for u8: */ #define le8_to_cpu(v) (v) #define cpu_to_le8(v) (v) #define __le8 u8 static inline u8 get_unaligned_le8(const void *p) { return *(u8 *)p; } static inline void put_unaligned_le8(u8 val, void *p) { *(u8 *)p = val; } #define read_eb_member(eb, ptr, type, member, result) (\ read_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ offsetof(type, member), \ sizeof_field(type, member))) #define write_eb_member(eb, ptr, type, member, result) (\ write_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ offsetof(type, member), \ sizeof_field(type, member))) #define DECLARE_BTRFS_SETGET_BITS(bits) \ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ const void *ptr, unsigned long off); \ void btrfs_set_token_##bits(struct btrfs_map_token *token, \ const void *ptr, unsigned long off, \ u##bits val); \ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off); \ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ unsigned long off, u##bits val); DECLARE_BTRFS_SETGET_BITS(8) DECLARE_BTRFS_SETGET_BITS(16) DECLARE_BTRFS_SETGET_BITS(32) DECLARE_BTRFS_SETGET_BITS(64) #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ const type *s) \ { \ static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ return btrfs_get_##bits(eb, s, offsetof(type, member)); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ u##bits val) \ { \ static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ } \ static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ const type *s) \ { \ static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ return btrfs_get_token_##bits(token, s, offsetof(type, member));\ } \ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ type *s, u##bits val) \ { \ static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ } #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ { \ const type *p = folio_address(eb->folios[0]) + \ offset_in_page(eb->start); \ return get_unaligned_le##bits(&p->member); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, \ u##bits val) \ { \ type *p = folio_address(eb->folios[0]) + offset_in_page(eb->start); \ put_unaligned_le##bits(val, &p->member); \ } #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const type *s) \ { \ return get_unaligned_le##bits(&s->member); \ } \ static inline void btrfs_set_##name(type *s, u##bits val) \ { \ put_unaligned_le##bits(val, &s->member); \ } static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, total_bytes)); return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes)); } static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s, u64 val) { static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, total_bytes)); WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); } BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, start_offset, 64); BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, total_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, io_align, 32); BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, io_width, 32); BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, sector_size, 32); BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, dev_group, 32); BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, seek_speed, 8); BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, bandwidth, 8); BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, generation, 64); static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) { return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); } static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) { return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); } BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) { return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); } BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, io_align, 32); BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, io_width, 32); BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, sector_size, 32); BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, int nr) { unsigned long offset = (unsigned long)c; offset += offsetof(struct btrfs_chunk, stripe); offset += nr * sizeof(struct btrfs_stripe); return (struct btrfs_stripe *)offset; } static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) { return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); } static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, struct btrfs_chunk *c, int nr) { return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); } static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb, struct btrfs_chunk *c, int nr, u64 val) { btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val); } static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, struct btrfs_chunk *c, int nr) { return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); } static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb, struct btrfs_chunk *c, int nr, u64 val) { btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val); } /* struct btrfs_block_group_item */ BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, used, 64); BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64); BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid, struct btrfs_block_group_item, chunk_objectid, 64); BTRFS_SETGET_FUNCS(block_group_chunk_objectid, struct btrfs_block_group_item, chunk_objectid, 64); BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, struct btrfs_block_group_item, flags, 64); /* struct btrfs_free_space_info */ BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, extent_count, 32); BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); /* struct btrfs_inode_ref */ BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_index, struct btrfs_inode_ref, index, 64); /* struct btrfs_inode_extref */ BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, parent_objectid, 64); BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, name_len, 16); BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); /* struct btrfs_inode_item */ BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, sequence, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, transid, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, nbytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, block_group, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8); BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64); BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64); BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8); BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64); BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64); /* struct btrfs_dev_extent */ BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64); BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, chunk_objectid, 64); BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, chunk_offset, 64); BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_objectid, struct btrfs_dev_extent, chunk_objectid, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_offset, struct btrfs_dev_extent, chunk_offset, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_length, struct btrfs_dev_extent, length, 64); BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64); BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); static inline void btrfs_tree_block_key(const struct extent_buffer *eb, struct btrfs_tree_block_info *item, struct btrfs_disk_key *key) { read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); } static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb, struct btrfs_tree_block_info *item, struct btrfs_disk_key *key) { write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); } BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, root, 64); BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, objectid, 64); BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, offset, 64); BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 32); BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32); BTRFS_SETGET_FUNCS(extent_owner_ref_root_id, struct btrfs_extent_owner_ref, root_id, 64); BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, type, 8); BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, offset, 64); static inline u32 btrfs_extent_inline_ref_size(int type) { if (type == BTRFS_TREE_BLOCK_REF_KEY || type == BTRFS_SHARED_BLOCK_REF_KEY) return sizeof(struct btrfs_extent_inline_ref); if (type == BTRFS_SHARED_DATA_REF_KEY) return sizeof(struct btrfs_shared_data_ref) + sizeof(struct btrfs_extent_inline_ref); if (type == BTRFS_EXTENT_DATA_REF_KEY) return sizeof(struct btrfs_extent_data_ref) + offsetof(struct btrfs_extent_inline_ref, offset); if (type == BTRFS_EXTENT_OWNER_REF_KEY) return sizeof(struct btrfs_extent_inline_ref); return 0; } /* struct btrfs_node */ BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, blockptr, 64); BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, generation, 64); static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); } static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb, int nr, u64 val) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); } static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); } static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, int nr, u64 val) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); } static inline unsigned long btrfs_node_key_ptr_offset(const struct extent_buffer *eb, int nr) { return offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; } void btrfs_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr); static inline void btrfs_set_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { unsigned long ptr; ptr = btrfs_node_key_ptr_offset(eb, nr); write_eb_member(eb, (struct btrfs_key_ptr *)ptr, struct btrfs_key_ptr, key, disk_key); } /* struct btrfs_item */ BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); static inline unsigned long btrfs_item_nr_offset(const struct extent_buffer *eb, int nr) { return offsetof(struct btrfs_leaf, items) + sizeof(struct btrfs_item) * nr; } static inline struct btrfs_item *btrfs_item_nr(const struct extent_buffer *eb, int nr) { return (struct btrfs_item *)btrfs_item_nr_offset(eb, nr); } #define BTRFS_ITEM_SETGET_FUNCS(member) \ static inline u32 btrfs_item_##member(const struct extent_buffer *eb, int slot) \ { \ return btrfs_raw_item_##member(eb, btrfs_item_nr(eb, slot)); \ } \ static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ int slot, u32 val) \ { \ btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val); \ } \ static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ int slot) \ { \ struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ return btrfs_token_raw_item_##member(token, item); \ } \ static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ int slot, u32 val) \ { \ struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ btrfs_set_token_raw_item_##member(token, item, val); \ } BTRFS_ITEM_SETGET_FUNCS(offset) BTRFS_ITEM_SETGET_FUNCS(size); static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) { return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); } static inline void btrfs_item_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { struct btrfs_item *item = btrfs_item_nr(eb, nr); read_eb_member(eb, item, struct btrfs_item, key, disk_key); } static inline void btrfs_set_item_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { struct btrfs_item *item = btrfs_item_nr(eb, nr); write_eb_member(eb, item, struct btrfs_item, key, disk_key); } BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); /* struct btrfs_root_ref */ BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_root_ref_dirid, struct btrfs_root_ref, dirid, 64); BTRFS_SETGET_STACK_FUNCS(stack_root_ref_sequence, struct btrfs_root_ref, sequence, 64); BTRFS_SETGET_STACK_FUNCS(stack_root_ref_name_len, struct btrfs_root_ref, name_len, 16); /* struct btrfs_dir_item */ BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); BTRFS_SETGET_FUNCS(dir_flags, struct btrfs_dir_item, type, 8); BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); BTRFS_SETGET_STACK_FUNCS(stack_dir_flags, struct btrfs_dir_item, type, 8); BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, data_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, name_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64); static inline u8 btrfs_dir_ftype(const struct extent_buffer *eb, const struct btrfs_dir_item *item) { return btrfs_dir_flags_to_ftype(btrfs_dir_flags(eb, item)); } static inline u8 btrfs_stack_dir_ftype(const struct btrfs_dir_item *item) { return btrfs_dir_flags_to_ftype(btrfs_stack_dir_flags(item)); } static inline void btrfs_dir_item_key(const struct extent_buffer *eb, const struct btrfs_dir_item *item, struct btrfs_disk_key *key) { read_eb_member(eb, item, struct btrfs_dir_item, location, key); } static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, struct btrfs_dir_item *item, const struct btrfs_disk_key *key) { write_eb_member(eb, item, struct btrfs_dir_item, location, key); } BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, num_entries, 64); BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, num_bitmaps, 64); BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, generation, 64); static inline void btrfs_free_space_key(const struct extent_buffer *eb, const struct btrfs_free_space_header *h, struct btrfs_disk_key *key) { read_eb_member(eb, h, struct btrfs_free_space_header, location, key); } static inline void btrfs_set_free_space_key(struct extent_buffer *eb, struct btrfs_free_space_header *h, const struct btrfs_disk_key *key) { write_eb_member(eb, h, struct btrfs_free_space_header, location, key); } /* struct btrfs_disk_key */ BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, objectid, 64); BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); #ifdef __LITTLE_ENDIAN /* * Optimized helpers for little-endian architectures where CPU and on-disk * structures have the same endianness and we can skip conversions. */ static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key, const struct btrfs_disk_key *disk_key) { memcpy(cpu_key, disk_key, sizeof(struct btrfs_key)); } static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key, const struct btrfs_key *cpu_key) { memcpy(disk_key, cpu_key, sizeof(struct btrfs_key)); } static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, struct btrfs_key *cpu_key, int nr) { struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; btrfs_node_key(eb, disk_key, nr); } static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, struct btrfs_key *cpu_key, int nr) { struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; btrfs_item_key(eb, disk_key, nr); } static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, const struct btrfs_dir_item *item, struct btrfs_key *cpu_key) { struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; btrfs_dir_item_key(eb, item, disk_key); } #else static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, const struct btrfs_disk_key *disk) { cpu->offset = le64_to_cpu(disk->offset); cpu->type = disk->type; cpu->objectid = le64_to_cpu(disk->objectid); } static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, const struct btrfs_key *cpu) { disk->offset = cpu_to_le64(cpu->offset); disk->type = cpu->type; disk->objectid = cpu_to_le64(cpu->objectid); } static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, struct btrfs_key *key, int nr) { struct btrfs_disk_key disk_key; btrfs_node_key(eb, &disk_key, nr); btrfs_disk_key_to_cpu(key, &disk_key); } static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, struct btrfs_key *key, int nr) { struct btrfs_disk_key disk_key; btrfs_item_key(eb, &disk_key, nr); btrfs_disk_key_to_cpu(key, &disk_key); } static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, const struct btrfs_dir_item *item, struct btrfs_key *key) { struct btrfs_disk_key disk_key; btrfs_dir_item_key(eb, item, &disk_key); btrfs_disk_key_to_cpu(key, &disk_key); } #endif /* struct btrfs_header */ BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, generation, 64); BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, nritems, 32); BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) { return (btrfs_header_flags(eb) & flag) == flag; } static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) { u64 flags = btrfs_header_flags(eb); btrfs_set_header_flags(eb, flags | flag); } static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) { u64 flags = btrfs_header_flags(eb); btrfs_set_header_flags(eb, flags & ~flag); } static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) { u64 flags = btrfs_header_flags(eb); return flags >> BTRFS_BACKREF_REV_SHIFT; } static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, int rev) { u64 flags = btrfs_header_flags(eb); flags &= ~BTRFS_BACKREF_REV_MASK; flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; btrfs_set_header_flags(eb, flags); } static inline int btrfs_is_leaf(const struct extent_buffer *eb) { return btrfs_header_level(eb) == 0; } /* struct btrfs_root_item */ BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, generation, 64); BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8); BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, last_snapshot, 64); BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, generation_v2, 64); BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, ctransid, 64); BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, otransid, 64); BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, stransid, 64); BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, rtransid, 64); /* struct btrfs_root_backup */ BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, tree_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, tree_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, tree_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, chunk_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, chunk_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, chunk_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, extent_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, extent_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, extent_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, fs_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, fs_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, fs_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, dev_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, dev_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, dev_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, csum_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, csum_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, csum_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, total_bytes, 64); BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, num_devices, 64); /* struct btrfs_balance_item */ BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); static inline void btrfs_balance_data(const struct extent_buffer *eb, const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); } static inline void btrfs_set_balance_data(struct extent_buffer *eb, struct btrfs_balance_item *bi, const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); } static inline void btrfs_balance_meta(const struct extent_buffer *eb, const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); } static inline void btrfs_set_balance_meta(struct extent_buffer *eb, struct btrfs_balance_item *bi, const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); } static inline void btrfs_balance_sys(const struct extent_buffer *eb, const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); } static inline void btrfs_set_balance_sys(struct extent_buffer *eb, struct btrfs_balance_item *bi, const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); } /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, generation, 64); BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, struct btrfs_super_block, sys_chunk_array_size, 32); BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, struct btrfs_super_block, chunk_root_generation, 64); BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, chunk_root, 64); BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, chunk_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64); BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, log_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, total_bytes, 64); BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, sectorsize, 32); BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, nodesize, 32); BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, stripesize, 32); BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, root_dir_objectid, 64); BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, num_devices, 64); BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, compat_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, compat_ro_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, incompat_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, csum_type, 16); BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, cache_generation, 64); BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block, nr_global_roots, 64); /* struct btrfs_file_extent_item */ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, type, 8); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, struct btrfs_file_extent_item, disk_bytenr, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, struct btrfs_file_extent_item, offset, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, struct btrfs_file_extent_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, struct btrfs_file_extent_item, num_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes, struct btrfs_file_extent_item, ram_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, struct btrfs_file_extent_item, disk_num_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, struct btrfs_file_extent_item, compression, 8); BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, disk_bytenr, 64); BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, generation, 64); BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, disk_num_bytes, 64); BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, offset, 64); BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, num_bytes, 64); BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, ram_bytes, 64); BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, compression, 8); BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, encryption, 8); BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, other_encoding, 16); /* btrfs_qgroup_status_item */ BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, generation, 64); BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, version, 64); BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, flags, 64); BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, rescan, 64); BTRFS_SETGET_FUNCS(qgroup_status_enable_gen, struct btrfs_qgroup_status_item, enable_gen, 64); /* btrfs_qgroup_info_item */ BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, generation, 64); BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, rfer_cmpr, 64); BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, excl_cmpr, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, struct btrfs_qgroup_info_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, rfer_cmpr, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, excl_cmpr, 64); /* btrfs_qgroup_limit_item */ BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, flags, 64); BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, max_rfer, 64); BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, max_excl, 64); BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, rsv_rfer, 64); BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, rsv_excl, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_flags, struct btrfs_qgroup_limit_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, max_rfer, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, max_excl, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, rsv_rfer, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, rsv_excl, 64); /* btrfs_dev_replace_item */ BTRFS_SETGET_FUNCS(dev_replace_src_devid, struct btrfs_dev_replace_item, src_devid, 64); BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, 64); BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, replace_state, 64); BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, time_started, 64); BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, time_stopped, 64); BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, num_write_errors, 64); BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, struct btrfs_dev_replace_item, num_uncorrectable_read_errors, 64); BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, cursor_left, 64); BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, cursor_right, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, struct btrfs_dev_replace_item, src_devid, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, struct btrfs_dev_replace_item, replace_state, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, struct btrfs_dev_replace_item, time_started, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, struct btrfs_dev_replace_item, time_stopped, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, struct btrfs_dev_replace_item, num_write_errors, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, struct btrfs_dev_replace_item, num_uncorrectable_read_errors, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, struct btrfs_dev_replace_item, cursor_left, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, struct btrfs_dev_replace_item, cursor_right, 64); /* btrfs_verity_descriptor_item */ BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, encryption, 8); BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, size, 64); BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, struct btrfs_verity_descriptor_item, encryption, 8); BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, struct btrfs_verity_descriptor_item, size, 64); /* Cast into the data area of the leaf. */ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) #define btrfs_item_ptr_offset(leaf, slot) \ ((unsigned long)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) #endif |
8 7 2 6 3 6 5 3 7 9 9 2 5 5 6 5 7 8 5 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 | /* * Poly1305 authenticator algorithm, RFC7539 * * Copyright (C) 2015 Martin Willi * * Based on public domain code by Andrew Moon and Daniel J. Bernstein. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. */ #include <crypto/algapi.h> #include <crypto/internal/hash.h> #include <crypto/internal/poly1305.h> #include <linux/crypto.h> #include <linux/kernel.h> #include <linux/module.h> #include <asm/unaligned.h> static int crypto_poly1305_init(struct shash_desc *desc) { struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); poly1305_core_init(&dctx->h); dctx->buflen = 0; dctx->rset = 0; dctx->sset = false; return 0; } static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx, const u8 *src, unsigned int srclen) { if (!dctx->sset) { if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) { poly1305_core_setkey(&dctx->core_r, src); src += POLY1305_BLOCK_SIZE; srclen -= POLY1305_BLOCK_SIZE; dctx->rset = 2; } if (srclen >= POLY1305_BLOCK_SIZE) { dctx->s[0] = get_unaligned_le32(src + 0); dctx->s[1] = get_unaligned_le32(src + 4); dctx->s[2] = get_unaligned_le32(src + 8); dctx->s[3] = get_unaligned_le32(src + 12); src += POLY1305_BLOCK_SIZE; srclen -= POLY1305_BLOCK_SIZE; dctx->sset = true; } } return srclen; } static void poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, unsigned int srclen) { unsigned int datalen; if (unlikely(!dctx->sset)) { datalen = crypto_poly1305_setdesckey(dctx, src, srclen); src += srclen - datalen; srclen = datalen; } poly1305_core_blocks(&dctx->h, &dctx->core_r, src, srclen / POLY1305_BLOCK_SIZE, 1); } static int crypto_poly1305_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); unsigned int bytes; if (unlikely(dctx->buflen)) { bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); memcpy(dctx->buf + dctx->buflen, src, bytes); src += bytes; srclen -= bytes; dctx->buflen += bytes; if (dctx->buflen == POLY1305_BLOCK_SIZE) { poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE); dctx->buflen = 0; } } if (likely(srclen >= POLY1305_BLOCK_SIZE)) { poly1305_blocks(dctx, src, srclen); src += srclen - (srclen % POLY1305_BLOCK_SIZE); srclen %= POLY1305_BLOCK_SIZE; } if (unlikely(srclen)) { dctx->buflen = srclen; memcpy(dctx->buf, src, srclen); } return 0; } static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) { struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); if (unlikely(!dctx->sset)) return -ENOKEY; poly1305_final_generic(dctx, dst); return 0; } static struct shash_alg poly1305_alg = { .digestsize = POLY1305_DIGEST_SIZE, .init = crypto_poly1305_init, .update = crypto_poly1305_update, .final = crypto_poly1305_final, .descsize = sizeof(struct poly1305_desc_ctx), .base = { .cra_name = "poly1305", .cra_driver_name = "poly1305-generic", .cra_priority = 100, .cra_blocksize = POLY1305_BLOCK_SIZE, .cra_module = THIS_MODULE, }, }; static int __init poly1305_mod_init(void) { return crypto_register_shash(&poly1305_alg); } static void __exit poly1305_mod_exit(void) { crypto_unregister_shash(&poly1305_alg); } subsys_initcall(poly1305_mod_init); module_exit(poly1305_mod_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); MODULE_DESCRIPTION("Poly1305 authenticator"); MODULE_ALIAS_CRYPTO("poly1305"); MODULE_ALIAS_CRYPTO("poly1305-generic"); |
225 34 19 122 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 | /* SPDX-License-Identifier: GPL-2.0 */ /* File: linux/posix_acl.h (C) 2002 Andreas Gruenbacher, <a.gruenbacher@computer.org> */ #ifndef __LINUX_POSIX_ACL_H #define __LINUX_POSIX_ACL_H #include <linux/bug.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/refcount.h> #include <uapi/linux/posix_acl.h> struct user_namespace; struct posix_acl_entry { short e_tag; unsigned short e_perm; union { kuid_t e_uid; kgid_t e_gid; }; }; struct posix_acl { refcount_t a_refcount; struct rcu_head a_rcu; unsigned int a_count; struct posix_acl_entry a_entries[]; }; #define FOREACH_ACL_ENTRY(pa, acl, pe) \ for(pa=(acl)->a_entries, pe=pa+(acl)->a_count; pa<pe; pa++) /* * Duplicate an ACL handle. */ static inline struct posix_acl * posix_acl_dup(struct posix_acl *acl) { if (acl) refcount_inc(&acl->a_refcount); return acl; } /* * Free an ACL handle. */ static inline void posix_acl_release(struct posix_acl *acl) { if (acl && refcount_dec_and_test(&acl->a_refcount)) kfree_rcu(acl, a_rcu); } /* posix_acl.c */ extern void posix_acl_init(struct posix_acl *, int); extern struct posix_acl *posix_acl_alloc(int, gfp_t); extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t); extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *); extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *); extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); int set_posix_acl(struct mnt_idmap *, struct dentry *, int, struct posix_acl *); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); struct posix_acl *posix_acl_clone(const struct posix_acl *acl, gfp_t flags); #ifdef CONFIG_FS_POSIX_ACL int posix_acl_chmod(struct mnt_idmap *, struct dentry *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, struct posix_acl **); int posix_acl_update_mode(struct mnt_idmap *, struct inode *, umode_t *, struct posix_acl **); int simple_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); extern int simple_acl_create(struct inode *, struct inode *); struct posix_acl *get_cached_acl(struct inode *inode, int type); void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl); void forget_cached_acl(struct inode *inode, int type); void forget_all_cached_acls(struct inode *inode); int posix_acl_valid(struct user_namespace *, const struct posix_acl *); int posix_acl_permission(struct mnt_idmap *, struct inode *, const struct posix_acl *, int); static inline void cache_no_acl(struct inode *inode) { inode->i_acl = NULL; inode->i_default_acl = NULL; } int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); int posix_acl_listxattr(struct inode *inode, char **buffer, ssize_t *remaining_size); #else static inline int posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) { return 0; } #define simple_set_acl NULL static inline int simple_acl_create(struct inode *dir, struct inode *inode) { return 0; } static inline void cache_no_acl(struct inode *inode) { } static inline int posix_acl_create(struct inode *inode, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl) { *default_acl = *acl = NULL; return 0; } static inline void forget_all_cached_acls(struct inode *inode) { } static inline int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *acl) { return -EOPNOTSUPP; } static inline struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return ERR_PTR(-EOPNOTSUPP); } static inline int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return -EOPNOTSUPP; } static inline int posix_acl_listxattr(struct inode *inode, char **buffer, ssize_t *remaining_size) { return 0; } #endif /* CONFIG_FS_POSIX_ACL */ struct posix_acl *get_inode_acl(struct inode *inode, int type); #endif /* __LINUX_POSIX_ACL_H */ |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 | // SPDX-License-Identifier: GPL-2.0-only /* * ebt_snat * * Authors: * Bart De Schuymer <bdschuym@pandora.be> * * June, 2002 * */ #include <linux/module.h> #include <net/sock.h> #include <linux/if_arp.h> #include <net/arp.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_nat.h> static unsigned int ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nat_info *info = par->targinfo; if (skb_ensure_writable(skb, 0)) return EBT_DROP; ether_addr_copy(eth_hdr(skb)->h_source, info->mac); if (!(info->target & NAT_ARP_BIT) && eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) { const struct arphdr *ap; struct arphdr _ah; ap = skb_header_pointer(skb, 0, sizeof(_ah), &_ah); if (ap == NULL) return EBT_DROP; if (ap->ar_hln != ETH_ALEN) goto out; if (skb_store_bits(skb, sizeof(_ah), info->mac, ETH_ALEN)) return EBT_DROP; } out: return info->target | ~EBT_VERDICT_BITS; } static int ebt_snat_tg_check(const struct xt_tgchk_param *par) { const struct ebt_nat_info *info = par->targinfo; int tmp; tmp = info->target | ~EBT_VERDICT_BITS; if (BASE_CHAIN && tmp == EBT_RETURN) return -EINVAL; if (ebt_invalid_target(tmp)) return -EINVAL; tmp = info->target | EBT_VERDICT_BITS; if ((tmp & ~NAT_ARP_BIT) != ~NAT_ARP_BIT) return -EINVAL; return 0; } static struct xt_target ebt_snat_tg_reg __read_mostly = { .name = "snat", .revision = 0, .family = NFPROTO_BRIDGE, .table = "nat", .hooks = (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_POST_ROUTING), .target = ebt_snat_tg, .checkentry = ebt_snat_tg_check, .targetsize = sizeof(struct ebt_nat_info), .me = THIS_MODULE, }; static int __init ebt_snat_init(void) { return xt_register_target(&ebt_snat_tg_reg); } static void __exit ebt_snat_fini(void) { xt_unregister_target(&ebt_snat_tg_reg); } module_init(ebt_snat_init); module_exit(ebt_snat_fini); MODULE_DESCRIPTION("Ebtables: Source MAC address translation"); MODULE_LICENSE("GPL"); |
8 38 32 26 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * ALSA sequencer Timer * Copyright (c) 1998-1999 by Frank van de Pol <fvdpol@coil.demon.nl> */ #ifndef __SND_SEQ_TIMER_H #define __SND_SEQ_TIMER_H #include <sound/timer.h> #include <sound/seq_kernel.h> struct snd_seq_timer_tick { snd_seq_tick_time_t cur_tick; /* current tick */ unsigned long resolution; /* time per tick in nsec */ unsigned long fraction; /* current time per tick in nsec */ }; struct snd_seq_timer { /* ... tempo / offset / running state */ unsigned int running:1, /* running state of queue */ initialized:1; /* timer is initialized */ unsigned int tempo; /* current tempo, us/tick */ int ppq; /* time resolution, ticks/quarter */ snd_seq_real_time_t cur_time; /* current time */ struct snd_seq_timer_tick tick; /* current tick */ int tick_updated; int type; /* timer type */ struct snd_timer_id alsa_id; /* ALSA's timer ID */ struct snd_timer_instance *timeri; /* timer instance */ unsigned int ticks; unsigned long preferred_resolution; /* timer resolution, ticks/sec */ unsigned int skew; unsigned int skew_base; struct timespec64 last_update; /* time of last clock update, used for interpolation */ spinlock_t lock; }; /* create new timer (constructor) */ struct snd_seq_timer *snd_seq_timer_new(void); /* delete timer (destructor) */ void snd_seq_timer_delete(struct snd_seq_timer **tmr); /* */ static inline void snd_seq_timer_update_tick(struct snd_seq_timer_tick *tick, unsigned long resolution) { if (tick->resolution > 0) { tick->fraction += resolution; tick->cur_tick += (unsigned int)(tick->fraction / tick->resolution); tick->fraction %= tick->resolution; } } /* compare timestamp between events */ /* return 1 if a >= b; otherwise return 0 */ static inline int snd_seq_compare_tick_time(snd_seq_tick_time_t *a, snd_seq_tick_time_t *b) { /* compare ticks */ return (*a >= *b); } static inline int snd_seq_compare_real_time(snd_seq_real_time_t *a, snd_seq_real_time_t *b) { /* compare real time */ if (a->tv_sec > b->tv_sec) return 1; if ((a->tv_sec == b->tv_sec) && (a->tv_nsec >= b->tv_nsec)) return 1; return 0; } static inline void snd_seq_sanity_real_time(snd_seq_real_time_t *tm) { while (tm->tv_nsec >= 1000000000) { /* roll-over */ tm->tv_nsec -= 1000000000; tm->tv_sec++; } } /* increment timestamp */ static inline void snd_seq_inc_real_time(snd_seq_real_time_t *tm, snd_seq_real_time_t *inc) { tm->tv_sec += inc->tv_sec; tm->tv_nsec += inc->tv_nsec; snd_seq_sanity_real_time(tm); } static inline void snd_seq_inc_time_nsec(snd_seq_real_time_t *tm, unsigned long nsec) { tm->tv_nsec += nsec; snd_seq_sanity_real_time(tm); } /* called by timer isr */ struct snd_seq_queue; int snd_seq_timer_open(struct snd_seq_queue *q); int snd_seq_timer_close(struct snd_seq_queue *q); int snd_seq_timer_midi_open(struct snd_seq_queue *q); int snd_seq_timer_midi_close(struct snd_seq_queue *q); void snd_seq_timer_defaults(struct snd_seq_timer *tmr); void snd_seq_timer_reset(struct snd_seq_timer *tmr); int snd_seq_timer_stop(struct snd_seq_timer *tmr); int snd_seq_timer_start(struct snd_seq_timer *tmr); int snd_seq_timer_continue(struct snd_seq_timer *tmr); int snd_seq_timer_set_tempo(struct snd_seq_timer *tmr, int tempo); int snd_seq_timer_set_tempo_ppq(struct snd_seq_timer *tmr, int tempo, int ppq); int snd_seq_timer_set_position_tick(struct snd_seq_timer *tmr, snd_seq_tick_time_t position); int snd_seq_timer_set_position_time(struct snd_seq_timer *tmr, snd_seq_real_time_t position); int snd_seq_timer_set_skew(struct snd_seq_timer *tmr, unsigned int skew, unsigned int base); snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr, bool adjust_ktime); snd_seq_tick_time_t snd_seq_timer_get_cur_tick(struct snd_seq_timer *tmr); extern int seq_default_timer_class; extern int seq_default_timer_sclass; extern int seq_default_timer_card; extern int seq_default_timer_device; extern int seq_default_timer_subdevice; extern int seq_default_timer_resolution; #endif |
5 39 1 58 50 4 4 53 4 1 20 4 34 34 49 46 46 44 13 29 26 18 8 8 8 30 4 4 38 30 4 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 | // SPDX-License-Identifier: GPL-2.0-or-later /* * CDC Ethernet based networking peripherals * Copyright (C) 2003-2005 by David Brownell * Copyright (C) 2006 by Ole Andre Vadla Ravnas (ActiveSync) */ // #define DEBUG // error path messages, extra info // #define VERBOSE // more; success messages #include <linux/module.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/workqueue.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/usb/cdc.h> #include <linux/usb/usbnet.h> #if IS_ENABLED(CONFIG_USB_NET_RNDIS_HOST) static int is_rndis(struct usb_interface_descriptor *desc) { return (desc->bInterfaceClass == USB_CLASS_COMM && desc->bInterfaceSubClass == 2 && desc->bInterfaceProtocol == 0xff); } static int is_activesync(struct usb_interface_descriptor *desc) { return (desc->bInterfaceClass == USB_CLASS_MISC && desc->bInterfaceSubClass == 1 && desc->bInterfaceProtocol == 1); } static int is_wireless_rndis(struct usb_interface_descriptor *desc) { return (desc->bInterfaceClass == USB_CLASS_WIRELESS_CONTROLLER && desc->bInterfaceSubClass == 1 && desc->bInterfaceProtocol == 3); } static int is_novatel_rndis(struct usb_interface_descriptor *desc) { return (desc->bInterfaceClass == USB_CLASS_MISC && desc->bInterfaceSubClass == 4 && desc->bInterfaceProtocol == 1); } #else #define is_rndis(desc) 0 #define is_activesync(desc) 0 #define is_wireless_rndis(desc) 0 #define is_novatel_rndis(desc) 0 #endif static const u8 mbm_guid[16] = { 0xa3, 0x17, 0xa8, 0x8b, 0x04, 0x5e, 0x4f, 0x01, 0xa6, 0x07, 0xc0, 0xff, 0xcb, 0x7e, 0x39, 0x2a, }; void usbnet_cdc_update_filter(struct usbnet *dev) { struct net_device *net = dev->net; u16 cdc_filter = USB_CDC_PACKET_TYPE_DIRECTED | USB_CDC_PACKET_TYPE_BROADCAST; /* filtering on the device is an optional feature and not worth * the hassle so we just roughly care about snooping and if any * multicast is requested, we take every multicast */ if (net->flags & IFF_PROMISC) cdc_filter |= USB_CDC_PACKET_TYPE_PROMISCUOUS; if (!netdev_mc_empty(net) || (net->flags & IFF_ALLMULTI)) cdc_filter |= USB_CDC_PACKET_TYPE_ALL_MULTICAST; usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), USB_CDC_SET_ETHERNET_PACKET_FILTER, USB_TYPE_CLASS | USB_RECIP_INTERFACE, cdc_filter, dev->intf->cur_altsetting->desc.bInterfaceNumber, NULL, 0, USB_CTRL_SET_TIMEOUT ); } EXPORT_SYMBOL_GPL(usbnet_cdc_update_filter); /* We need to override usbnet_*_link_ksettings in bind() */ static const struct ethtool_ops cdc_ether_ethtool_ops = { .get_link = usbnet_get_link, .nway_reset = usbnet_nway_reset, .get_drvinfo = usbnet_get_drvinfo, .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_ts_info = ethtool_op_get_ts_info, .get_link_ksettings = usbnet_get_link_ksettings_internal, .set_link_ksettings = NULL, }; /* probes control interface, claims data interface, collects the bulk * endpoints, activates data interface (if needed), maybe sets MTU. * all pure cdc, except for certain firmware workarounds, and knowing * that rndis uses one different rule. */ int usbnet_generic_cdc_bind(struct usbnet *dev, struct usb_interface *intf) { u8 *buf = intf->cur_altsetting->extra; int len = intf->cur_altsetting->extralen; struct usb_interface_descriptor *d; struct cdc_state *info = (void *) &dev->data; int status; int rndis; bool android_rndis_quirk = false; struct usb_driver *driver = driver_of(intf); struct usb_cdc_parsed_header header; if (sizeof(dev->data) < sizeof(*info)) return -EDOM; /* expect strict spec conformance for the descriptors, but * cope with firmware which stores them in the wrong place */ if (len == 0 && dev->udev->actconfig->extralen) { /* Motorola SB4100 (and others: Brad Hards says it's * from a Broadcom design) put CDC descriptors here */ buf = dev->udev->actconfig->extra; len = dev->udev->actconfig->extralen; dev_dbg(&intf->dev, "CDC descriptors on config\n"); } /* Maybe CDC descriptors are after the endpoint? This bug has * been seen on some 2Wire Inc RNDIS-ish products. */ if (len == 0) { struct usb_host_endpoint *hep; hep = intf->cur_altsetting->endpoint; if (hep) { buf = hep->extra; len = hep->extralen; } if (len) dev_dbg(&intf->dev, "CDC descriptors on endpoint\n"); } /* this assumes that if there's a non-RNDIS vendor variant * of cdc-acm, it'll fail RNDIS requests cleanly. */ rndis = (is_rndis(&intf->cur_altsetting->desc) || is_activesync(&intf->cur_altsetting->desc) || is_wireless_rndis(&intf->cur_altsetting->desc) || is_novatel_rndis(&intf->cur_altsetting->desc)); memset(info, 0, sizeof(*info)); info->control = intf; cdc_parse_cdc_header(&header, intf, buf, len); info->u = header.usb_cdc_union_desc; info->header = header.usb_cdc_header_desc; info->ether = header.usb_cdc_ether_desc; if (!info->u) { if (rndis) goto skip; else /* in that case a quirk is mandatory */ goto bad_desc; } /* we need a master/control interface (what we're * probed with) and a slave/data interface; union * descriptors sort this all out. */ info->control = usb_ifnum_to_if(dev->udev, info->u->bMasterInterface0); info->data = usb_ifnum_to_if(dev->udev, info->u->bSlaveInterface0); if (!info->control || !info->data) { dev_dbg(&intf->dev, "master #%u/%p slave #%u/%p\n", info->u->bMasterInterface0, info->control, info->u->bSlaveInterface0, info->data); /* fall back to hard-wiring for RNDIS */ if (rndis) { android_rndis_quirk = true; goto skip; } goto bad_desc; } if (info->control != intf) { dev_dbg(&intf->dev, "bogus CDC Union\n"); /* Ambit USB Cable Modem (and maybe others) * interchanges master and slave interface. */ if (info->data == intf) { info->data = info->control; info->control = intf; } else goto bad_desc; } /* some devices merge these - skip class check */ if (info->control == info->data) goto skip; /* a data interface altsetting does the real i/o */ d = &info->data->cur_altsetting->desc; if (d->bInterfaceClass != USB_CLASS_CDC_DATA) { dev_dbg(&intf->dev, "slave class %u\n", d->bInterfaceClass); goto bad_desc; } skip: /* Communication class functions with bmCapabilities are not * RNDIS. But some Wireless class RNDIS functions use * bmCapabilities for their own purpose. The failsafe is * therefore applied only to Communication class RNDIS * functions. The rndis test is redundant, but a cheap * optimization. */ if (rndis && is_rndis(&intf->cur_altsetting->desc) && header.usb_cdc_acm_descriptor && header.usb_cdc_acm_descriptor->bmCapabilities) { dev_dbg(&intf->dev, "ACM capabilities %02x, not really RNDIS?\n", header.usb_cdc_acm_descriptor->bmCapabilities); goto bad_desc; } if (header.usb_cdc_ether_desc && info->ether->wMaxSegmentSize) { dev->hard_mtu = le16_to_cpu(info->ether->wMaxSegmentSize); /* because of Zaurus, we may be ignoring the host * side link address we were given. */ } if (header.usb_cdc_mdlm_desc && memcmp(header.usb_cdc_mdlm_desc->bGUID, mbm_guid, 16)) { dev_dbg(&intf->dev, "GUID doesn't match\n"); goto bad_desc; } if (header.usb_cdc_mdlm_detail_desc && header.usb_cdc_mdlm_detail_desc->bLength < (sizeof(struct usb_cdc_mdlm_detail_desc) + 1)) { dev_dbg(&intf->dev, "Descriptor too short\n"); goto bad_desc; } /* Microsoft ActiveSync based and some regular RNDIS devices lack the * CDC descriptors, so we'll hard-wire the interfaces and not check * for descriptors. * * Some Android RNDIS devices have a CDC Union descriptor pointing * to non-existing interfaces. Ignore that and attempt the same * hard-wired 0 and 1 interfaces. */ if (rndis && (!info->u || android_rndis_quirk)) { info->control = usb_ifnum_to_if(dev->udev, 0); info->data = usb_ifnum_to_if(dev->udev, 1); if (!info->control || !info->data || info->control != intf) { dev_dbg(&intf->dev, "rndis: master #0/%p slave #1/%p\n", info->control, info->data); goto bad_desc; } } else if (!info->header || (!rndis && !info->ether)) { dev_dbg(&intf->dev, "missing cdc %s%s%sdescriptor\n", info->header ? "" : "header ", info->u ? "" : "union ", info->ether ? "" : "ether "); goto bad_desc; } /* claim data interface and set it up ... with side effects. * network traffic can't flow until an altsetting is enabled. */ if (info->data != info->control) { status = usb_driver_claim_interface(driver, info->data, dev); if (status < 0) return status; } status = usbnet_get_endpoints(dev, info->data); if (status < 0) { /* ensure immediate exit from usbnet_disconnect */ usb_set_intfdata(info->data, NULL); if (info->data != info->control) usb_driver_release_interface(driver, info->data); return status; } /* status endpoint: optional for CDC Ethernet, not RNDIS (or ACM) */ if (info->data != info->control) dev->status = NULL; if (info->control->cur_altsetting->desc.bNumEndpoints == 1) { struct usb_endpoint_descriptor *desc; dev->status = &info->control->cur_altsetting->endpoint[0]; desc = &dev->status->desc; if (!usb_endpoint_is_int_in(desc) || (le16_to_cpu(desc->wMaxPacketSize) < sizeof(struct usb_cdc_notification)) || !desc->bInterval) { dev_dbg(&intf->dev, "bad notification endpoint\n"); dev->status = NULL; } } if (rndis && !dev->status) { dev_dbg(&intf->dev, "missing RNDIS status endpoint\n"); usb_set_intfdata(info->data, NULL); usb_driver_release_interface(driver, info->data); return -ENODEV; } /* override ethtool_ops */ dev->net->ethtool_ops = &cdc_ether_ethtool_ops; return 0; bad_desc: dev_info(&dev->udev->dev, "bad CDC descriptors\n"); return -ENODEV; } EXPORT_SYMBOL_GPL(usbnet_generic_cdc_bind); /* like usbnet_generic_cdc_bind() but handles filter initialization * correctly */ int usbnet_ether_cdc_bind(struct usbnet *dev, struct usb_interface *intf) { int rv; rv = usbnet_generic_cdc_bind(dev, intf); if (rv < 0) goto bail_out; /* Some devices don't initialise properly. In particular * the packet filter is not reset. There are devices that * don't do reset all the way. So the packet filter should * be set to a sane initial value. */ usbnet_cdc_update_filter(dev); bail_out: return rv; } EXPORT_SYMBOL_GPL(usbnet_ether_cdc_bind); void usbnet_cdc_unbind(struct usbnet *dev, struct usb_interface *intf) { struct cdc_state *info = (void *) &dev->data; struct usb_driver *driver = driver_of(intf); /* combined interface - nothing to do */ if (info->data == info->control) return; /* disconnect master --> disconnect slave */ if (intf == info->control && info->data) { /* ensure immediate exit from usbnet_disconnect */ usb_set_intfdata(info->data, NULL); usb_driver_release_interface(driver, info->data); info->data = NULL; } /* and vice versa (just in case) */ else if (intf == info->data && info->control) { /* ensure immediate exit from usbnet_disconnect */ usb_set_intfdata(info->control, NULL); usb_driver_release_interface(driver, info->control); info->control = NULL; } } EXPORT_SYMBOL_GPL(usbnet_cdc_unbind); /* Communications Device Class, Ethernet Control model * * Takes two interfaces. The DATA interface is inactive till an altsetting * is selected. Configuration data includes class descriptors. There's * an optional status endpoint on the control interface. * * This should interop with whatever the 2.4 "CDCEther.c" driver * (by Brad Hards) talked with, with more functionality. */ static void speed_change(struct usbnet *dev, __le32 *speeds) { dev->tx_speed = __le32_to_cpu(speeds[0]); dev->rx_speed = __le32_to_cpu(speeds[1]); } void usbnet_cdc_status(struct usbnet *dev, struct urb *urb) { struct usb_cdc_notification *event; if (urb->actual_length < sizeof(*event)) return; /* SPEED_CHANGE can get split into two 8-byte packets */ if (test_and_clear_bit(EVENT_STS_SPLIT, &dev->flags)) { speed_change(dev, (__le32 *) urb->transfer_buffer); return; } event = urb->transfer_buffer; switch (event->bNotificationType) { case USB_CDC_NOTIFY_NETWORK_CONNECTION: netif_dbg(dev, timer, dev->net, "CDC: carrier %s\n", event->wValue ? "on" : "off"); usbnet_link_change(dev, !!event->wValue, 0); break; case USB_CDC_NOTIFY_SPEED_CHANGE: /* tx/rx rates */ netif_dbg(dev, timer, dev->net, "CDC: speed change (len %d)\n", urb->actual_length); if (urb->actual_length != (sizeof(*event) + 8)) set_bit(EVENT_STS_SPLIT, &dev->flags); else speed_change(dev, (__le32 *) &event[1]); break; /* USB_CDC_NOTIFY_RESPONSE_AVAILABLE can happen too (e.g. RNDIS), * but there are no standard formats for the response data. */ default: netdev_err(dev->net, "CDC: unexpected notification %02x!\n", event->bNotificationType); break; } } EXPORT_SYMBOL_GPL(usbnet_cdc_status); int usbnet_cdc_bind(struct usbnet *dev, struct usb_interface *intf) { int status; struct cdc_state *info = (void *) &dev->data; BUILD_BUG_ON((sizeof(((struct usbnet *)0)->data) < sizeof(struct cdc_state))); status = usbnet_ether_cdc_bind(dev, intf); if (status < 0) return status; status = usbnet_get_ethernet_addr(dev, info->ether->iMACAddress); if (status < 0) { usb_set_intfdata(info->data, NULL); usb_driver_release_interface(driver_of(intf), info->data); return status; } return 0; } EXPORT_SYMBOL_GPL(usbnet_cdc_bind); static int usbnet_cdc_zte_bind(struct usbnet *dev, struct usb_interface *intf) { int status = usbnet_cdc_bind(dev, intf); if (!status && (dev->net->dev_addr[0] & 0x02)) eth_hw_addr_random(dev->net); return status; } /* Make sure packets have correct destination MAC address * * A firmware bug observed on some devices (ZTE MF823/831/910) is that the * device sends packets with a static, bogus, random MAC address (event if * device MAC address has been updated). Always set MAC address to that of the * device. */ int usbnet_cdc_zte_rx_fixup(struct usbnet *dev, struct sk_buff *skb) { if (skb->len < ETH_HLEN || !(skb->data[0] & 0x02)) return 1; skb_reset_mac_header(skb); ether_addr_copy(eth_hdr(skb)->h_dest, dev->net->dev_addr); return 1; } EXPORT_SYMBOL_GPL(usbnet_cdc_zte_rx_fixup); /* Ensure correct link state * * Some devices (ZTE MF823/831/910) export two carrier on notifications when * connected. This causes the link state to be incorrect. Work around this by * always setting the state to off, then on. */ static void usbnet_cdc_zte_status(struct usbnet *dev, struct urb *urb) { struct usb_cdc_notification *event; if (urb->actual_length < sizeof(*event)) return; event = urb->transfer_buffer; if (event->bNotificationType != USB_CDC_NOTIFY_NETWORK_CONNECTION) { usbnet_cdc_status(dev, urb); return; } netif_dbg(dev, timer, dev->net, "CDC: carrier %s\n", event->wValue ? "on" : "off"); if (event->wValue && netif_carrier_ok(dev->net)) netif_carrier_off(dev->net); usbnet_link_change(dev, !!event->wValue, 0); } static const struct driver_info cdc_info = { .description = "CDC Ethernet Device", .flags = FLAG_ETHER | FLAG_POINTTOPOINT, .bind = usbnet_cdc_bind, .unbind = usbnet_cdc_unbind, .status = usbnet_cdc_status, .set_rx_mode = usbnet_cdc_update_filter, .manage_power = usbnet_manage_power, }; static const struct driver_info zte_cdc_info = { .description = "ZTE CDC Ethernet Device", .flags = FLAG_ETHER | FLAG_POINTTOPOINT, .bind = usbnet_cdc_zte_bind, .unbind = usbnet_cdc_unbind, .status = usbnet_cdc_zte_status, .set_rx_mode = usbnet_cdc_update_filter, .manage_power = usbnet_manage_power, .rx_fixup = usbnet_cdc_zte_rx_fixup, }; static const struct driver_info wwan_info = { .description = "Mobile Broadband Network Device", .flags = FLAG_WWAN, .bind = usbnet_cdc_bind, .unbind = usbnet_cdc_unbind, .status = usbnet_cdc_status, .set_rx_mode = usbnet_cdc_update_filter, .manage_power = usbnet_manage_power, }; /*-------------------------------------------------------------------------*/ #define HUAWEI_VENDOR_ID 0x12D1 #define NOVATEL_VENDOR_ID 0x1410 #define ZTE_VENDOR_ID 0x19D2 #define DELL_VENDOR_ID 0x413C #define REALTEK_VENDOR_ID 0x0bda #define SAMSUNG_VENDOR_ID 0x04e8 #define LENOVO_VENDOR_ID 0x17ef #define LINKSYS_VENDOR_ID 0x13b1 #define NVIDIA_VENDOR_ID 0x0955 #define HP_VENDOR_ID 0x03f0 #define MICROSOFT_VENDOR_ID 0x045e #define UBLOX_VENDOR_ID 0x1546 #define TPLINK_VENDOR_ID 0x2357 #define AQUANTIA_VENDOR_ID 0x2eca #define ASIX_VENDOR_ID 0x0b95 static const struct usb_device_id products[] = { /* BLACKLIST !! * * First blacklist any products that are egregiously nonconformant * with the CDC Ethernet specs. Minor braindamage we cope with; when * they're not even trying, needing a separate driver is only the first * of the differences to show up. */ #define ZAURUS_MASTER_INTERFACE \ .bInterfaceClass = USB_CLASS_COMM, \ .bInterfaceSubClass = USB_CDC_SUBCLASS_ETHERNET, \ .bInterfaceProtocol = USB_CDC_PROTO_NONE #define ZAURUS_FAKE_INTERFACE \ .bInterfaceClass = USB_CLASS_COMM, \ .bInterfaceSubClass = USB_CDC_SUBCLASS_MDLM, \ .bInterfaceProtocol = USB_CDC_PROTO_NONE /* SA-1100 based Sharp Zaurus ("collie"), or compatible; * wire-incompatible with true CDC Ethernet implementations. * (And, it seems, needlessly so...) */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8004, ZAURUS_MASTER_INTERFACE, .driver_info = 0, }, /* PXA-25x based Sharp Zaurii. Note that it seems some of these * (later models especially) may have shipped only with firmware * advertising false "CDC MDLM" compatibility ... but we're not * clear which models did that, so for now let's assume the worst. */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8005, /* A-300 */ ZAURUS_MASTER_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8005, /* A-300 */ ZAURUS_FAKE_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8006, /* B-500/SL-5600 */ ZAURUS_MASTER_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8006, /* B-500/SL-5600 */ ZAURUS_FAKE_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8007, /* C-700 */ ZAURUS_MASTER_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8007, /* C-700 */ ZAURUS_FAKE_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x9031, /* C-750 C-760 */ ZAURUS_MASTER_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x9032, /* SL-6000 */ ZAURUS_MASTER_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x9032, /* SL-6000 */ ZAURUS_FAKE_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, /* reported with some C860 units */ .idProduct = 0x9050, /* C-860 */ ZAURUS_MASTER_INTERFACE, .driver_info = 0, }, /* Olympus has some models with a Zaurus-compatible option. * R-1000 uses a FreeScale i.MXL cpu (ARMv4T) */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x07B4, .idProduct = 0x0F02, /* R-1000 */ ZAURUS_MASTER_INTERFACE, .driver_info = 0, }, /* LG Electronics VL600 wants additional headers on every frame */ { USB_DEVICE_AND_INTERFACE_INFO(0x1004, 0x61aa, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Logitech Harmony 900 - uses the pseudo-MDLM (BLAN) driver */ { USB_DEVICE_AND_INTERFACE_INFO(0x046d, 0xc11f, USB_CLASS_COMM, USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Novatel USB551L and MC551 - handled by qmi_wwan */ { USB_DEVICE_AND_INTERFACE_INFO(NOVATEL_VENDOR_ID, 0xB001, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Novatel E362 - handled by qmi_wwan */ { USB_DEVICE_AND_INTERFACE_INFO(NOVATEL_VENDOR_ID, 0x9010, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Dell Wireless 5800 (Novatel E362) - handled by qmi_wwan */ { USB_DEVICE_AND_INTERFACE_INFO(DELL_VENDOR_ID, 0x8195, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Dell Wireless 5800 (Novatel E362) - handled by qmi_wwan */ { USB_DEVICE_AND_INTERFACE_INFO(DELL_VENDOR_ID, 0x8196, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Dell Wireless 5804 (Novatel E371) - handled by qmi_wwan */ { USB_DEVICE_AND_INTERFACE_INFO(DELL_VENDOR_ID, 0x819b, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Novatel Expedite E371 - handled by qmi_wwan */ { USB_DEVICE_AND_INTERFACE_INFO(NOVATEL_VENDOR_ID, 0x9011, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* HP lt2523 (Novatel E371) - handled by qmi_wwan */ { USB_DEVICE_AND_INTERFACE_INFO(HP_VENDOR_ID, 0x421d, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* AnyDATA ADU960S - handled by qmi_wwan */ { USB_DEVICE_AND_INTERFACE_INFO(0x16d5, 0x650a, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Huawei E1820 - handled by qmi_wwan */ { USB_DEVICE_INTERFACE_NUMBER(HUAWEI_VENDOR_ID, 0x14ac, 1), .driver_info = 0, }, /* Realtek RTL8153 Based USB 3.0 Ethernet Adapters */ { USB_DEVICE_AND_INTERFACE_INFO(REALTEK_VENDOR_ID, 0x8153, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Lenovo Powered USB-C Travel Hub (4X90S92381, based on Realtek RTL8153) */ { USB_DEVICE_AND_INTERFACE_INFO(LENOVO_VENDOR_ID, 0x721e, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Aquantia AQtion USB to 5GbE Controller (based on AQC111U) */ { USB_DEVICE_AND_INTERFACE_INFO(AQUANTIA_VENDOR_ID, 0xc101, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* ASIX USB 3.1 Gen1 to 5G Multi-Gigabit Ethernet Adapter(based on AQC111U) */ { USB_DEVICE_AND_INTERFACE_INFO(ASIX_VENDOR_ID, 0x2790, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* ASIX USB 3.1 Gen1 to 2.5G Multi-Gigabit Ethernet Adapter(based on AQC112U) */ { USB_DEVICE_AND_INTERFACE_INFO(ASIX_VENDOR_ID, 0x2791, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* USB-C 3.1 to 5GBASE-T Ethernet Adapter (based on AQC111U) */ { USB_DEVICE_AND_INTERFACE_INFO(0x20f4, 0xe05a, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* QNAP QNA-UC5G1T USB to 5GbE Adapter (based on AQC111U) */ { USB_DEVICE_AND_INTERFACE_INFO(0x1c04, 0x0015, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* WHITELIST!!! * * CDC Ether uses two interfaces, not necessarily consecutive. * We match the main interface, ignoring the optional device * class so we could handle devices that aren't exclusively * CDC ether. * * NOTE: this match must come AFTER entries blacklisting devices * because of bugs/quirks in a given product (like Zaurus, above). */ { /* ZTE (Vodafone) K3805-Z */ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1003, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* ZTE (Vodafone) K3806-Z */ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1015, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* ZTE (Vodafone) K4510-Z */ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1173, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* ZTE (Vodafone) K3770-Z */ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1177, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* ZTE (Vodafone) K3772-Z */ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1181, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* Telit modules */ USB_VENDOR_AND_INTERFACE_INFO(0x1bc7, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (kernel_ulong_t) &wwan_info, }, { /* Dell DW5580 modules */ USB_DEVICE_AND_INTERFACE_INFO(DELL_VENDOR_ID, 0x81ba, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (kernel_ulong_t)&wwan_info, }, { /* Huawei ME906 and ME909 */ USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0x15c1, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* ZTE modules */ USB_VENDOR_AND_INTERFACE_INFO(ZTE_VENDOR_ID, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&zte_cdc_info, }, { /* U-blox TOBY-L2 */ USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1143, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* U-blox SARA-U2 */ USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1104, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* U-blox LARA-R6 01B */ USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1313, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* U-blox LARA-L6 */ USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1343, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* Cinterion PLS8 modem by GEMALTO */ USB_DEVICE_AND_INTERFACE_INFO(0x1e2d, 0x0061, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* Cinterion AHS3 modem by GEMALTO */ USB_DEVICE_AND_INTERFACE_INFO(0x1e2d, 0x0055, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* Cinterion PLS62-W modem by GEMALTO/THALES */ USB_DEVICE_AND_INTERFACE_INFO(0x1e2d, 0x005b, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* Cinterion PLS83/PLS63 modem by GEMALTO/THALES */ USB_DEVICE_AND_INTERFACE_INFO(0x1e2d, 0x0069, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long) &cdc_info, }, { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* Various Huawei modems with a network port like the UMG1831 */ USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, 255), .driver_info = (unsigned long)&wwan_info, }, { }, /* END */ }; MODULE_DEVICE_TABLE(usb, products); static struct usb_driver cdc_driver = { .name = "cdc_ether", .id_table = products, .probe = usbnet_probe, .disconnect = usbnet_disconnect, .suspend = usbnet_suspend, .resume = usbnet_resume, .reset_resume = usbnet_resume, .supports_autosuspend = 1, .disable_hub_initiated_lpm = 1, }; module_usb_driver(cdc_driver); MODULE_AUTHOR("David Brownell"); MODULE_DESCRIPTION("USB CDC Ethernet devices"); MODULE_LICENSE("GPL"); |
1542 1542 401 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 | // SPDX-License-Identifier: GPL-2.0-or-later /* * vrf.c: device driver to encapsulate a VRF space * * Copyright (c) 2015 Cumulus Networks. All rights reserved. * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com> * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> * * Based on dummy, team and ipvlan drivers */ #include <linux/ethtool.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ip.h> #include <linux/init.h> #include <linux/moduleparam.h> #include <linux/netfilter.h> #include <linux/rtnetlink.h> #include <net/rtnetlink.h> #include <linux/u64_stats_sync.h> #include <linux/hashtable.h> #include <linux/spinlock_types.h> #include <linux/inetdevice.h> #include <net/arp.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/route.h> #include <net/addrconf.h> #include <net/l3mdev.h> #include <net/fib_rules.h> #include <net/sch_generic.h> #include <net/netns/generic.h> #include <net/netfilter/nf_conntrack.h> #define DRV_NAME "vrf" #define DRV_VERSION "1.1" #define FIB_RULE_PREF 1000 /* default preference for FIB rules */ #define HT_MAP_BITS 4 #define HASH_INITVAL ((u32)0xcafef00d) struct vrf_map { DECLARE_HASHTABLE(ht, HT_MAP_BITS); spinlock_t vmap_lock; /* shared_tables: * count how many distinct tables do not comply with the strict mode * requirement. * shared_tables value must be 0 in order to enable the strict mode. * * example of the evolution of shared_tables: * | time * add vrf0 --> table 100 shared_tables = 0 | t0 * add vrf1 --> table 101 shared_tables = 0 | t1 * add vrf2 --> table 100 shared_tables = 1 | t2 * add vrf3 --> table 100 shared_tables = 1 | t3 * add vrf4 --> table 101 shared_tables = 2 v t4 * * shared_tables is a "step function" (or "staircase function") * and it is increased by one when the second vrf is associated to a * table. * * at t2, vrf0 and vrf2 are bound to table 100: shared_tables = 1. * * at t3, another dev (vrf3) is bound to the same table 100 but the * value of shared_tables is still 1. * This means that no matter how many new vrfs will register on the * table 100, the shared_tables will not increase (considering only * table 100). * * at t4, vrf4 is bound to table 101, and shared_tables = 2. * * Looking at the value of shared_tables we can immediately know if * the strict_mode can or cannot be enforced. Indeed, strict_mode * can be enforced iff shared_tables = 0. * * Conversely, shared_tables is decreased when a vrf is de-associated * from a table with exactly two associated vrfs. */ u32 shared_tables; bool strict_mode; }; struct vrf_map_elem { struct hlist_node hnode; struct list_head vrf_list; /* VRFs registered to this table */ u32 table_id; int users; int ifindex; }; static unsigned int vrf_net_id; /* per netns vrf data */ struct netns_vrf { /* protected by rtnl lock */ bool add_fib_rules; struct vrf_map vmap; struct ctl_table_header *ctl_hdr; }; struct net_vrf { struct rtable __rcu *rth; struct rt6_info __rcu *rt6; #if IS_ENABLED(CONFIG_IPV6) struct fib6_table *fib6_table; #endif u32 tb_id; struct list_head me_list; /* entry in vrf_map_elem */ int ifindex; }; static void vrf_rx_stats(struct net_device *dev, int len) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); dstats->rx_packets++; dstats->rx_bytes += len; u64_stats_update_end(&dstats->syncp); } static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) { vrf_dev->stats.tx_errors++; kfree_skb(skb); } static void vrf_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { int i; for_each_possible_cpu(i) { const struct pcpu_dstats *dstats; u64 tbytes, tpkts, tdrops, rbytes, rpkts; unsigned int start; dstats = per_cpu_ptr(dev->dstats, i); do { start = u64_stats_fetch_begin(&dstats->syncp); tbytes = dstats->tx_bytes; tpkts = dstats->tx_packets; tdrops = dstats->tx_drops; rbytes = dstats->rx_bytes; rpkts = dstats->rx_packets; } while (u64_stats_fetch_retry(&dstats->syncp, start)); stats->tx_bytes += tbytes; stats->tx_packets += tpkts; stats->tx_dropped += tdrops; stats->rx_bytes += rbytes; stats->rx_packets += rpkts; } } static struct vrf_map *netns_vrf_map(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); return &nn_vrf->vmap; } static struct vrf_map *netns_vrf_map_by_dev(struct net_device *dev) { return netns_vrf_map(dev_net(dev)); } static int vrf_map_elem_get_vrf_ifindex(struct vrf_map_elem *me) { struct list_head *me_head = &me->vrf_list; struct net_vrf *vrf; if (list_empty(me_head)) return -ENODEV; vrf = list_first_entry(me_head, struct net_vrf, me_list); return vrf->ifindex; } static struct vrf_map_elem *vrf_map_elem_alloc(gfp_t flags) { struct vrf_map_elem *me; me = kmalloc(sizeof(*me), flags); if (!me) return NULL; return me; } static void vrf_map_elem_free(struct vrf_map_elem *me) { kfree(me); } static void vrf_map_elem_init(struct vrf_map_elem *me, int table_id, int ifindex, int users) { me->table_id = table_id; me->ifindex = ifindex; me->users = users; INIT_LIST_HEAD(&me->vrf_list); } static struct vrf_map_elem *vrf_map_lookup_elem(struct vrf_map *vmap, u32 table_id) { struct vrf_map_elem *me; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_for_each_possible(vmap->ht, me, hnode, key) { if (me->table_id == table_id) return me; } return NULL; } static void vrf_map_add_elem(struct vrf_map *vmap, struct vrf_map_elem *me) { u32 table_id = me->table_id; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_add(vmap->ht, &me->hnode, key); } static void vrf_map_del_elem(struct vrf_map_elem *me) { hash_del(&me->hnode); } static void vrf_map_lock(struct vrf_map *vmap) __acquires(&vmap->vmap_lock) { spin_lock(&vmap->vmap_lock); } static void vrf_map_unlock(struct vrf_map *vmap) __releases(&vmap->vmap_lock) { spin_unlock(&vmap->vmap_lock); } /* called with rtnl lock held */ static int vrf_map_register_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); struct vrf_map_elem *new_me, *me; u32 table_id = vrf->tb_id; bool free_new_me = false; int users; int res; /* we pre-allocate elements used in the spin-locked section (so that we * keep the spinlock as short as possible). */ new_me = vrf_map_elem_alloc(GFP_KERNEL); if (!new_me) return -ENOMEM; vrf_map_elem_init(new_me, table_id, dev->ifindex, 0); vrf_map_lock(vmap); me = vrf_map_lookup_elem(vmap, table_id); if (!me) { me = new_me; vrf_map_add_elem(vmap, me); goto link_vrf; } /* we already have an entry in the vrf_map, so it means there is (at * least) a vrf registered on the specific table. */ free_new_me = true; if (vmap->strict_mode) { /* vrfs cannot share the same table */ NL_SET_ERR_MSG(extack, "Table is used by another VRF"); res = -EBUSY; goto unlock; } link_vrf: users = ++me->users; if (users == 2) ++vmap->shared_tables; list_add(&vrf->me_list, &me->vrf_list); res = 0; unlock: vrf_map_unlock(vmap); /* clean-up, if needed */ if (free_new_me) vrf_map_elem_free(new_me); return res; } /* called with rtnl lock held */ static void vrf_map_unregister_dev(struct net_device *dev) { struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); u32 table_id = vrf->tb_id; struct vrf_map_elem *me; int users; vrf_map_lock(vmap); me = vrf_map_lookup_elem(vmap, table_id); if (!me) goto unlock; list_del(&vrf->me_list); users = --me->users; if (users == 1) { --vmap->shared_tables; } else if (users == 0) { vrf_map_del_elem(me); /* no one will refer to this element anymore */ vrf_map_elem_free(me); } unlock: vrf_map_unlock(vmap); } /* return the vrf device index associated with the table_id */ static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id) { struct vrf_map *vmap = netns_vrf_map(net); struct vrf_map_elem *me; int ifindex; vrf_map_lock(vmap); if (!vmap->strict_mode) { ifindex = -EPERM; goto unlock; } me = vrf_map_lookup_elem(vmap, table_id); if (!me) { ifindex = -ENODEV; goto unlock; } ifindex = vrf_map_elem_get_vrf_ifindex(me); unlock: vrf_map_unlock(vmap); return ifindex; } /* by default VRF devices do not have a qdisc and are expected * to be created with only a single queue. */ static bool qdisc_tx_is_default(const struct net_device *dev) { struct netdev_queue *txq; struct Qdisc *qdisc; if (dev->num_tx_queues > 1) return false; txq = netdev_get_tx_queue(dev, 0); qdisc = rcu_access_pointer(txq->qdisc); return !qdisc->enqueue; } /* Local traffic destined to local address. Reinsert the packet to rx * path, similar to loopback handling. */ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, struct dst_entry *dst) { int len = skb->len; skb_orphan(skb); skb_dst_set(skb, dst); /* set pkt_type to avoid skb hitting packet taps twice - * once on Tx and again in Rx processing */ skb->pkt_type = PACKET_LOOPBACK; skb->protocol = eth_type_trans(skb, dev); if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) vrf_rx_stats(dev, len); else this_cpu_inc(dev->dstats->rx_drops); return NETDEV_TX_OK; } static void vrf_nf_set_untracked(struct sk_buff *skb) { if (skb_get_nfct(skb) == 0) nf_ct_set(skb, NULL, IP_CT_UNTRACKED); } static void vrf_nf_reset_ct(struct sk_buff *skb) { if (skb_get_nfct(skb) == IP_CT_UNTRACKED) nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) static int vrf_ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; vrf_nf_reset_ct(skb); err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { const struct ipv6hdr *iph; struct net *net = dev_net(skb->dev); struct flowi6 fl6; int ret = NET_XMIT_DROP; struct dst_entry *dst; struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) goto err; iph = ipv6_hdr(skb); memset(&fl6, 0, sizeof(fl6)); /* needed to match OIF rule */ fl6.flowi6_l3mdev = dev->ifindex; fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = iph->nexthdr; dst = ip6_dst_lookup_flow(net, NULL, &fl6, NULL); if (IS_ERR(dst) || dst == dst_null) goto err; skb_dst_drop(skb); /* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path. */ if (dst->dev == dev) return vrf_local_xmit(skb, dev, dst); skb_dst_set(skb, dst); /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); ret = vrf_ip6_local_out(net, skb->sk, skb); if (unlikely(net_xmit_eval(ret))) dev->stats.tx_errors++; else ret = NET_XMIT_SUCCESS; return ret; err: vrf_tx_error(dev, skb); return NET_XMIT_DROP; } #else static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { vrf_tx_error(dev, skb); return NET_XMIT_DROP; } #endif /* based on ip_local_out; can't use it b/c the dst is switched pointing to us */ static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; vrf_nf_reset_ct(skb); err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, struct net_device *vrf_dev) { struct iphdr *ip4h; int ret = NET_XMIT_DROP; struct flowi4 fl4; struct net *net = dev_net(vrf_dev); struct rtable *rt; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) goto err; ip4h = ip_hdr(skb); memset(&fl4, 0, sizeof(fl4)); /* needed to match OIF rule */ fl4.flowi4_l3mdev = vrf_dev->ifindex; fl4.flowi4_iif = LOOPBACK_IFINDEX; fl4.flowi4_tos = RT_TOS(ip4h->tos); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = ip4h->protocol; fl4.daddr = ip4h->daddr; fl4.saddr = ip4h->saddr; rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto err; skb_dst_drop(skb); /* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path. */ if (rt->dst.dev == vrf_dev) return vrf_local_xmit(skb, vrf_dev, &rt->dst); skb_dst_set(skb, &rt->dst); /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); if (!ip4h->saddr) { ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0, RT_SCOPE_LINK); } memset(IPCB(skb), 0, sizeof(*IPCB(skb))); ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(net_xmit_eval(ret))) vrf_dev->stats.tx_errors++; else ret = NET_XMIT_SUCCESS; out: return ret; err: vrf_tx_error(vrf_dev, skb); goto out; } static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) { switch (skb->protocol) { case htons(ETH_P_IP): return vrf_process_v4_outbound(skb, dev); case htons(ETH_P_IPV6): return vrf_process_v6_outbound(skb, dev); default: vrf_tx_error(dev, skb); return NET_XMIT_DROP; } } static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) { int len = skb->len; netdev_tx_t ret = is_ip_tx_frame(skb, dev); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); dstats->tx_packets++; dstats->tx_bytes += len; u64_stats_update_end(&dstats->syncp); } else { this_cpu_inc(dev->dstats->tx_drops); } return ret; } static void vrf_finish_direct(struct sk_buff *skb) { struct net_device *vrf_dev = skb->dev; if (!list_empty(&vrf_dev->ptype_all) && likely(skb_headroom(skb) >= ETH_HLEN)) { struct ethhdr *eth = skb_push(skb, ETH_HLEN); ether_addr_copy(eth->h_source, vrf_dev->dev_addr); eth_zero_addr(eth->h_dest); eth->h_proto = skb->protocol; dev_queue_xmit_nit(skb, vrf_dev); skb_pull(skb, ETH_HLEN); } vrf_nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) /* modelled after ip6_finish_output2 */ static int vrf_finish_output6(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; const struct in6_addr *nexthop; struct neighbour *neigh; int ret; vrf_nf_reset_ct(skb); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; rcu_read_lock(); nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); if (!IS_ERR(neigh)) { sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); rcu_read_unlock(); return ret; } rcu_read_unlock(); IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EINVAL; } /* modelled after ip6_output */ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb_dst(skb)->dev, vrf_finish_output6, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } /* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device. */ static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_vrf *vrf = netdev_priv(vrf_dev); struct dst_entry *dst = NULL; struct rt6_info *rt6; rcu_read_lock(); rt6 = rcu_dereference(vrf->rt6); if (likely(rt6)) { dst = &rt6->dst; dst_hold(dst); } rcu_read_unlock(); if (unlikely(!dst)) { vrf_tx_error(vrf_dev, skb); return NULL; } skb_dst_drop(skb); skb_dst_set(skb, dst); return skb; } static int vrf_output6_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { vrf_finish_direct(skb); return vrf_ip6_local_out(net, sk, skb); } static int vrf_output6_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { int err = 1; skb->protocol = htons(ETH_P_IPV6); if (!(IPCB(skb)->flags & IPSKB_REROUTED)) err = nf_hook(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, vrf_output6_direct_finish); if (likely(err == 1)) vrf_finish_direct(skb); return err; } static int vrf_ip6_out_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = vrf_output6_direct(net, sk, skb); if (likely(err == 1)) err = vrf_ip6_local_out(net, sk, skb); return err; } static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(vrf_dev); int err; skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip6_out_direct_finish); if (likely(err == 1)) err = vrf_output6_direct(net, sk, skb); if (likely(err == 1)) return skb; return NULL; } static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { /* don't divert link scope packets */ if (rt6_need_strict(&ipv6_hdr(skb)->daddr)) return skb; vrf_nf_set_untracked(skb); if (qdisc_tx_is_default(vrf_dev) || IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return vrf_ip6_out_direct(vrf_dev, sk, skb); return vrf_ip6_out_redirect(vrf_dev, skb); } /* holding rtnl */ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { struct rt6_info *rt6 = rtnl_dereference(vrf->rt6); struct net *net = dev_net(dev); struct dst_entry *dst; RCU_INIT_POINTER(vrf->rt6, NULL); synchronize_rcu(); /* move dev in dst's to loopback so this VRF device can be deleted * - based on dst_ifdown */ if (rt6) { dst = &rt6->dst; netdev_ref_replace(dst->dev, net->loopback_dev, &dst->dev_tracker, GFP_KERNEL); dst->dev = net->loopback_dev; dst_release(dst); } } static int vrf_rt6_create(struct net_device *dev) { int flags = DST_NOPOLICY | DST_NOXFRM; struct net_vrf *vrf = netdev_priv(dev); struct net *net = dev_net(dev); struct rt6_info *rt6; int rc = -ENOMEM; /* IPv6 can be CONFIG enabled and then disabled runtime */ if (!ipv6_mod_enabled()) return 0; vrf->fib6_table = fib6_new_table(net, vrf->tb_id); if (!vrf->fib6_table) goto out; /* create a dst for routing packets out a VRF device */ rt6 = ip6_dst_alloc(net, dev, flags); if (!rt6) goto out; rt6->dst.output = vrf_output6; rcu_assign_pointer(vrf->rt6, rt6); rc = 0; out: return rc; } #else static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { return skb; } static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { } static int vrf_rt6_create(struct net_device *dev) { return 0; } #endif /* modelled after ip_finish_output2 */ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; vrf_nf_reset_ct(skb); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) { dev->stats.tx_errors++; return -ENOMEM; } } rcu_read_lock(); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int ret; sock_confirm_neigh(skb, neigh); /* if crossing protocols, can not use the cached header */ ret = neigh_output(neigh, skb, is_v6gw); rcu_read_unlock(); return ret; } rcu_read_unlock(); vrf_tx_error(skb->dev, skb); return -EINVAL; } static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, dev, vrf_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } /* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device. */ static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_vrf *vrf = netdev_priv(vrf_dev); struct dst_entry *dst = NULL; struct rtable *rth; rcu_read_lock(); rth = rcu_dereference(vrf->rth); if (likely(rth)) { dst = &rth->dst; dst_hold(dst); } rcu_read_unlock(); if (unlikely(!dst)) { vrf_tx_error(vrf_dev, skb); return NULL; } skb_dst_drop(skb); skb_dst_set(skb, dst); return skb; } static int vrf_output_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { vrf_finish_direct(skb); return vrf_ip_local_out(net, sk, skb); } static int vrf_output_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { int err = 1; skb->protocol = htons(ETH_P_IP); if (!(IPCB(skb)->flags & IPSKB_REROUTED)) err = nf_hook(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, vrf_output_direct_finish); if (likely(err == 1)) vrf_finish_direct(skb); return err; } static int vrf_ip_out_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = vrf_output_direct(net, sk, skb); if (likely(err == 1)) err = vrf_ip_local_out(net, sk, skb); return err; } static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(vrf_dev); int err; skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip_out_direct_finish); if (likely(err == 1)) err = vrf_output_direct(net, sk, skb); if (likely(err == 1)) return skb; return NULL; } static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { /* don't divert multicast or local broadcast */ if (ipv4_is_multicast(ip_hdr(skb)->daddr) || ipv4_is_lbcast(ip_hdr(skb)->daddr)) return skb; vrf_nf_set_untracked(skb); if (qdisc_tx_is_default(vrf_dev) || IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return vrf_ip_out_direct(vrf_dev, sk, skb); return vrf_ip_out_redirect(vrf_dev, skb); } /* called with rcu lock held */ static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb, u16 proto) { switch (proto) { case AF_INET: return vrf_ip_out(vrf_dev, sk, skb); case AF_INET6: return vrf_ip6_out(vrf_dev, sk, skb); } return skb; } /* holding rtnl */ static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf) { struct rtable *rth = rtnl_dereference(vrf->rth); struct net *net = dev_net(dev); struct dst_entry *dst; RCU_INIT_POINTER(vrf->rth, NULL); synchronize_rcu(); /* move dev in dst's to loopback so this VRF device can be deleted * - based on dst_ifdown */ if (rth) { dst = &rth->dst; netdev_ref_replace(dst->dev, net->loopback_dev, &dst->dev_tracker, GFP_KERNEL); dst->dev = net->loopback_dev; dst_release(dst); } } static int vrf_rtable_create(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); struct rtable *rth; if (!fib_new_table(dev_net(dev), vrf->tb_id)) return -ENOMEM; /* create a dst for routing packets out through a VRF device */ rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1); if (!rth) return -ENOMEM; rth->dst.output = vrf_output; rcu_assign_pointer(vrf->rth, rth); return 0; } /**************************** device handling ********************/ /* cycle interface to flush neighbor cache and move routes across tables */ static void cycle_netdev(struct net_device *dev, struct netlink_ext_ack *extack) { unsigned int flags = dev->flags; int ret; if (!netif_running(dev)) return; ret = dev_change_flags(dev, flags & ~IFF_UP, extack); if (ret >= 0) ret = dev_change_flags(dev, flags, extack); if (ret < 0) { netdev_err(dev, "Failed to cycle device %s; route tables might be wrong!\n", dev->name); } } static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { int ret; /* do not allow loopback device to be enslaved to a VRF. * The vrf device acts as the loopback for the vrf. */ if (port_dev == dev_net(dev)->loopback_dev) { NL_SET_ERR_MSG(extack, "Can not enslave loopback device to a VRF"); return -EOPNOTSUPP; } port_dev->priv_flags |= IFF_L3MDEV_SLAVE; ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack); if (ret < 0) goto err; cycle_netdev(port_dev, extack); return 0; err: port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; return ret; } static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { if (netif_is_l3_master(port_dev)) { NL_SET_ERR_MSG(extack, "Can not enslave an L3 master device to a VRF"); return -EINVAL; } if (netif_is_l3_slave(port_dev)) return -EINVAL; return do_vrf_add_slave(dev, port_dev, extack); } /* inverse of do_vrf_add_slave */ static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev) { netdev_upper_dev_unlink(port_dev, dev); port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; cycle_netdev(port_dev, NULL); return 0; } static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev) { return do_vrf_del_slave(dev, port_dev); } static void vrf_dev_uninit(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); vrf_rtable_release(dev, vrf); vrf_rt6_release(dev, vrf); } static int vrf_dev_init(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); /* create the default dst which points back to us */ if (vrf_rtable_create(dev) != 0) goto out_nomem; if (vrf_rt6_create(dev) != 0) goto out_rth; dev->flags = IFF_MASTER | IFF_NOARP; /* similarly, oper state is irrelevant; set to up to avoid confusion */ dev->operstate = IF_OPER_UP; netdev_lockdep_set_classes(dev); return 0; out_rth: vrf_rtable_release(dev, vrf); out_nomem: return -ENOMEM; } static const struct net_device_ops vrf_netdev_ops = { .ndo_init = vrf_dev_init, .ndo_uninit = vrf_dev_uninit, .ndo_start_xmit = vrf_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_get_stats64 = vrf_get_stats64, .ndo_add_slave = vrf_add_slave, .ndo_del_slave = vrf_del_slave, }; static u32 vrf_fib_table(const struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); return vrf->tb_id; } static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook, struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1) skb = NULL; /* kfree_skb(skb) handled by nf code */ return skb; } static int vrf_prepare_mac_header(struct sk_buff *skb, struct net_device *vrf_dev, u16 proto) { struct ethhdr *eth; int err; /* in general, we do not know if there is enough space in the head of * the packet for hosting the mac header. */ err = skb_cow_head(skb, LL_RESERVED_SPACE(vrf_dev)); if (unlikely(err)) /* no space in the skb head */ return -ENOBUFS; __skb_push(skb, ETH_HLEN); eth = (struct ethhdr *)skb->data; skb_reset_mac_header(skb); skb_reset_mac_len(skb); /* we set the ethernet destination and the source addresses to the * address of the VRF device. */ ether_addr_copy(eth->h_dest, vrf_dev->dev_addr); ether_addr_copy(eth->h_source, vrf_dev->dev_addr); eth->h_proto = htons(proto); /* the destination address of the Ethernet frame corresponds to the * address set on the VRF interface; therefore, the packet is intended * to be processed locally. */ skb->protocol = eth->h_proto; skb->pkt_type = PACKET_HOST; skb_postpush_rcsum(skb, skb->data, ETH_HLEN); skb_pull_inline(skb, ETH_HLEN); return 0; } /* prepare and add the mac header to the packet if it was not set previously. * In this way, packet sniffers such as tcpdump can parse the packet correctly. * If the mac header was already set, the original mac header is left * untouched and the function returns immediately. */ static int vrf_add_mac_header_if_unset(struct sk_buff *skb, struct net_device *vrf_dev, u16 proto, struct net_device *orig_dev) { if (skb_mac_header_was_set(skb) && dev_has_header(orig_dev)) return 0; return vrf_prepare_mac_header(skb, vrf_dev, proto); } #if IS_ENABLED(CONFIG_IPV6) /* neighbor handling is done with actual device; do not want * to flip skb->dev for those ndisc packets. This really fails * for multiple next protocols (e.g., NEXTHDR_HOP). But it is * a start. */ static bool ipv6_ndisc_frame(const struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); bool rc = false; if (iph->nexthdr == NEXTHDR_ICMP) { const struct icmp6hdr *icmph; struct icmp6hdr _icmph; icmph = skb_header_pointer(skb, sizeof(*iph), sizeof(_icmph), &_icmph); if (!icmph) goto out; switch (icmph->icmp6_type) { case NDISC_ROUTER_SOLICITATION: case NDISC_ROUTER_ADVERTISEMENT: case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: rc = true; break; } } out: return rc; } static struct rt6_info *vrf_ip6_route_lookup(struct net *net, const struct net_device *dev, struct flowi6 *fl6, int ifindex, const struct sk_buff *skb, int flags) { struct net_vrf *vrf = netdev_priv(dev); return ip6_pol_route(net, vrf->fib6_table, ifindex, fl6, skb, flags); } static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, int ifindex) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct flowi6 fl6 = { .flowi6_iif = ifindex, .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), }; struct net *net = dev_net(vrf_dev); struct rt6_info *rt6; rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb, RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE); if (unlikely(!rt6)) return; if (unlikely(&rt6->dst == &net->ipv6.ip6_null_entry->dst)) return; skb_dst_set(skb, &rt6->dst); } static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { int orig_iif = skb->skb_iif; bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); bool is_ndisc = ipv6_ndisc_frame(skb); /* loopback, multicast & non-ND link-local traffic; do not push through * packet taps again. Reset pkt_type for upper layers to process skb. * For non-loopback strict packets, determine the dst using the original * ifindex. */ if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IP6CB(skb)->flags |= IP6SKB_L3SLAVE; if (skb->pkt_type == PACKET_LOOPBACK) skb->pkt_type = PACKET_HOST; else vrf_ip6_input_dst(skb, vrf_dev, orig_iif); goto out; } /* if packet is NDISC then keep the ingress interface */ if (!is_ndisc) { struct net_device *orig_dev = skb->dev; vrf_rx_stats(vrf_dev, skb->len); skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; if (!list_empty(&vrf_dev->ptype_all)) { int err; err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IPV6, orig_dev); if (likely(!err)) { skb_push(skb, skb->mac_len); dev_queue_xmit_nit(skb, vrf_dev); skb_pull(skb, skb->mac_len); } } IP6CB(skb)->flags |= IP6SKB_L3SLAVE; } if (need_strict) vrf_ip6_input_dst(skb, vrf_dev, orig_iif); skb = vrf_rcv_nfhook(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, vrf_dev); out: return skb; } #else static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { return skb; } #endif static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_device *orig_dev = skb->dev; skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IPCB(skb)->flags |= IPSKB_L3SLAVE; if (ipv4_is_multicast(ip_hdr(skb)->daddr)) goto out; /* loopback traffic; do not push through packet taps again. * Reset pkt_type for upper layers to process skb */ if (skb->pkt_type == PACKET_LOOPBACK) { skb->pkt_type = PACKET_HOST; goto out; } vrf_rx_stats(vrf_dev, skb->len); if (!list_empty(&vrf_dev->ptype_all)) { int err; err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IP, orig_dev); if (likely(!err)) { skb_push(skb, skb->mac_len); dev_queue_xmit_nit(skb, vrf_dev); skb_pull(skb, skb->mac_len); } } skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev); out: return skb; } /* called with rcu lock held */ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, struct sk_buff *skb, u16 proto) { switch (proto) { case AF_INET: return vrf_ip_rcv(vrf_dev, skb); case AF_INET6: return vrf_ip6_rcv(vrf_dev, skb); } return skb; } #if IS_ENABLED(CONFIG_IPV6) /* send to link-local or multicast address via interface enslaved to * VRF device. Force lookup to VRF table without changing flow struct * Note: Caller to this function must hold rcu_read_lock() and no refcnt * is taken on the dst by this function. */ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, struct flowi6 *fl6) { struct net *net = dev_net(dev); int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF; struct dst_entry *dst = NULL; struct rt6_info *rt; /* VRF device does not have a link-local address and * sending packets to link-local or mcast addresses over * a VRF device does not make sense */ if (fl6->flowi6_oif == dev->ifindex) { dst = &net->ipv6.ip6_null_entry->dst; return dst; } if (!ipv6_addr_any(&fl6->saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, NULL, flags); if (rt) dst = &rt->dst; return dst; } #endif static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_fib_table = vrf_fib_table, .l3mdev_l3_rcv = vrf_l3_rcv, .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) .l3mdev_link_scope_lookup = vrf_link_scope_lookup, #endif }; static void vrf_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, DRV_NAME, sizeof(info->driver)); strscpy(info->version, DRV_VERSION, sizeof(info->version)); } static const struct ethtool_ops vrf_ethtool_ops = { .get_drvinfo = vrf_get_drvinfo, }; static inline size_t vrf_fib_rule_nl_size(void) { size_t sz; sz = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)); sz += nla_total_size(sizeof(u8)); /* FRA_L3MDEV */ sz += nla_total_size(sizeof(u32)); /* FRA_PRIORITY */ sz += nla_total_size(sizeof(u8)); /* FRA_PROTOCOL */ return sz; } static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) { struct fib_rule_hdr *frh; struct nlmsghdr *nlh; struct sk_buff *skb; int err; if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) && !ipv6_mod_enabled()) return 0; skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL); if (!skb) return -ENOMEM; nlh = nlmsg_put(skb, 0, 0, 0, sizeof(*frh), 0); if (!nlh) goto nla_put_failure; /* rule only needs to appear once */ nlh->nlmsg_flags |= NLM_F_EXCL; frh = nlmsg_data(nlh); memset(frh, 0, sizeof(*frh)); frh->family = family; frh->action = FR_ACT_TO_TBL; if (nla_put_u8(skb, FRA_PROTOCOL, RTPROT_KERNEL)) goto nla_put_failure; if (nla_put_u8(skb, FRA_L3MDEV, 1)) goto nla_put_failure; if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF)) goto nla_put_failure; nlmsg_end(skb, nlh); /* fib_nl_{new,del}rule handling looks for net from skb->sk */ skb->sk = dev_net(dev)->rtnl; if (add_it) { err = fib_nl_newrule(skb, nlh, NULL); if (err == -EEXIST) err = 0; } else { err = fib_nl_delrule(skb, nlh, NULL); if (err == -ENOENT) err = 0; } nlmsg_free(skb); return err; nla_put_failure: nlmsg_free(skb); return -EMSGSIZE; } static int vrf_add_fib_rules(const struct net_device *dev) { int err; err = vrf_fib_rule(dev, AF_INET, true); if (err < 0) goto out_err; err = vrf_fib_rule(dev, AF_INET6, true); if (err < 0) goto ipv6_err; #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) err = vrf_fib_rule(dev, RTNL_FAMILY_IPMR, true); if (err < 0) goto ipmr_err; #endif #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true); if (err < 0) goto ip6mr_err; #endif return 0; #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) ip6mr_err: vrf_fib_rule(dev, RTNL_FAMILY_IPMR, false); #endif #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) ipmr_err: vrf_fib_rule(dev, AF_INET6, false); #endif ipv6_err: vrf_fib_rule(dev, AF_INET, false); out_err: netdev_err(dev, "Failed to add FIB rules.\n"); return err; } static void vrf_setup(struct net_device *dev) { ether_setup(dev); /* Initialize the device structure. */ dev->netdev_ops = &vrf_netdev_ops; dev->l3mdev_ops = &vrf_l3mdev_ops; dev->ethtool_ops = &vrf_ethtool_ops; dev->needs_free_netdev = true; /* Fill in device structure with ethernet-generic values. */ eth_hw_addr_random(dev); /* don't acquire vrf device's netif_tx_lock when transmitting */ dev->features |= NETIF_F_LLTX; /* don't allow vrf devices to change network namespaces. */ dev->features |= NETIF_F_NETNS_LOCAL; /* does not make sense for a VLAN to be added to a vrf device */ dev->features |= NETIF_F_VLAN_CHALLENGED; /* enable offload features */ dev->features |= NETIF_F_GSO_SOFTWARE; dev->features |= NETIF_F_RXCSUM | NETIF_F_HW_CSUM | NETIF_F_SCTP_CRC; dev->features |= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA; dev->hw_features = dev->features; dev->hw_enc_features = dev->features; /* default to no qdisc; user can add if desired */ dev->priv_flags |= IFF_NO_QUEUE; dev->priv_flags |= IFF_NO_RX_HANDLER; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; /* VRF devices do not care about MTU, but if the MTU is set * too low then the ipv4 and ipv6 protocols are disabled * which breaks networking. */ dev->min_mtu = IPV6_MIN_MTU; dev->max_mtu = IP6_MAX_MTU; dev->mtu = dev->max_mtu; dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static int vrf_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Invalid hardware address"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { NL_SET_ERR_MSG(extack, "Invalid hardware address"); return -EADDRNOTAVAIL; } } return 0; } static void vrf_dellink(struct net_device *dev, struct list_head *head) { struct net_device *port_dev; struct list_head *iter; netdev_for_each_lower_dev(dev, port_dev, iter) vrf_del_slave(dev, port_dev); vrf_map_unregister_dev(dev); unregister_netdevice_queue(dev, head); } static int vrf_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net_vrf *vrf = netdev_priv(dev); struct netns_vrf *nn_vrf; bool *add_fib_rules; struct net *net; int err; if (!data || !data[IFLA_VRF_TABLE]) { NL_SET_ERR_MSG(extack, "VRF table id is missing"); return -EINVAL; } vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]); if (vrf->tb_id == RT_TABLE_UNSPEC) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VRF_TABLE], "Invalid VRF table id"); return -EINVAL; } dev->priv_flags |= IFF_L3MDEV_MASTER; err = register_netdevice(dev); if (err) goto out; /* mapping between table_id and vrf; * note: such binding could not be done in the dev init function * because dev->ifindex id is not available yet. */ vrf->ifindex = dev->ifindex; err = vrf_map_register_dev(dev, extack); if (err) { unregister_netdevice(dev); goto out; } net = dev_net(dev); nn_vrf = net_generic(net, vrf_net_id); add_fib_rules = &nn_vrf->add_fib_rules; if (*add_fib_rules) { err = vrf_add_fib_rules(dev); if (err) { vrf_map_unregister_dev(dev); unregister_netdevice(dev); goto out; } *add_fib_rules = false; } out: return err; } static size_t vrf_nl_getsize(const struct net_device *dev) { return nla_total_size(sizeof(u32)); /* IFLA_VRF_TABLE */ } static int vrf_fillinfo(struct sk_buff *skb, const struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id); } static size_t vrf_get_slave_size(const struct net_device *bond_dev, const struct net_device *slave_dev) { return nla_total_size(sizeof(u32)); /* IFLA_VRF_PORT_TABLE */ } static int vrf_fill_slave_info(struct sk_buff *skb, const struct net_device *vrf_dev, const struct net_device *slave_dev) { struct net_vrf *vrf = netdev_priv(vrf_dev); if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id)) return -EMSGSIZE; return 0; } static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = { [IFLA_VRF_TABLE] = { .type = NLA_U32 }, }; static struct rtnl_link_ops vrf_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct net_vrf), .get_size = vrf_nl_getsize, .policy = vrf_nl_policy, .validate = vrf_validate, .fill_info = vrf_fillinfo, .get_slave_size = vrf_get_slave_size, .fill_slave_info = vrf_fill_slave_info, .newlink = vrf_newlink, .dellink = vrf_dellink, .setup = vrf_setup, .maxtype = IFLA_VRF_MAX, }; static int vrf_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); /* only care about unregister events to drop slave references */ if (event == NETDEV_UNREGISTER) { struct net_device *vrf_dev; if (!netif_is_l3_slave(dev)) goto out; vrf_dev = netdev_master_upper_dev_get(dev); vrf_del_slave(vrf_dev, dev); } out: return NOTIFY_DONE; } static struct notifier_block vrf_notifier_block __read_mostly = { .notifier_call = vrf_device_event, }; static int vrf_map_init(struct vrf_map *vmap) { spin_lock_init(&vmap->vmap_lock); hash_init(vmap->ht); vmap->strict_mode = false; return 0; } #ifdef CONFIG_SYSCTL static bool vrf_strict_mode(struct vrf_map *vmap) { bool strict_mode; vrf_map_lock(vmap); strict_mode = vmap->strict_mode; vrf_map_unlock(vmap); return strict_mode; } static int vrf_strict_mode_change(struct vrf_map *vmap, bool new_mode) { bool *cur_mode; int res = 0; vrf_map_lock(vmap); cur_mode = &vmap->strict_mode; if (*cur_mode == new_mode) goto unlock; if (*cur_mode) { /* disable strict mode */ *cur_mode = false; } else { if (vmap->shared_tables) { /* we cannot allow strict_mode because there are some * vrfs that share one or more tables. */ res = -EBUSY; goto unlock; } /* no tables are shared among vrfs, so we can go back * to 1:1 association between a vrf with its table. */ *cur_mode = true; } unlock: vrf_map_unlock(vmap); return res; } static int vrf_shared_table_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)table->extra1; struct vrf_map *vmap = netns_vrf_map(net); int proc_strict_mode = 0; struct ctl_table tmp = { .procname = table->procname, .data = &proc_strict_mode, .maxlen = sizeof(int), .mode = table->mode, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }; int ret; if (!write) proc_strict_mode = vrf_strict_mode(vmap); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) ret = vrf_strict_mode_change(vmap, (bool)proc_strict_mode); return ret; } static const struct ctl_table vrf_table[] = { { .procname = "strict_mode", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = vrf_shared_table_handler, /* set by the vrf_netns_init */ .extra1 = NULL, }, }; static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf) { struct ctl_table *table; table = kmemdup(vrf_table, sizeof(vrf_table), GFP_KERNEL); if (!table) return -ENOMEM; /* init the extra1 parameter with the reference to current netns */ table[0].extra1 = net; nn_vrf->ctl_hdr = register_net_sysctl_sz(net, "net/vrf", table, ARRAY_SIZE(vrf_table)); if (!nn_vrf->ctl_hdr) { kfree(table); return -ENOMEM; } return 0; } static void vrf_netns_exit_sysctl(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); struct ctl_table *table; table = nn_vrf->ctl_hdr->ctl_table_arg; unregister_net_sysctl_table(nn_vrf->ctl_hdr); kfree(table); } #else static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf) { return 0; } static void vrf_netns_exit_sysctl(struct net *net) { } #endif /* Initialize per network namespace state */ static int __net_init vrf_netns_init(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); nn_vrf->add_fib_rules = true; vrf_map_init(&nn_vrf->vmap); return vrf_netns_init_sysctl(net, nn_vrf); } static void __net_exit vrf_netns_exit(struct net *net) { vrf_netns_exit_sysctl(net); } static struct pernet_operations vrf_net_ops __net_initdata = { .init = vrf_netns_init, .exit = vrf_netns_exit, .id = &vrf_net_id, .size = sizeof(struct netns_vrf), }; static int __init vrf_init_module(void) { int rc; register_netdevice_notifier(&vrf_notifier_block); rc = register_pernet_subsys(&vrf_net_ops); if (rc < 0) goto error; rc = l3mdev_table_lookup_register(L3MDEV_TYPE_VRF, vrf_ifindex_lookup_by_table_id); if (rc < 0) goto unreg_pernet; rc = rtnl_link_register(&vrf_link_ops); if (rc < 0) goto table_lookup_unreg; return 0; table_lookup_unreg: l3mdev_table_lookup_unregister(L3MDEV_TYPE_VRF, vrf_ifindex_lookup_by_table_id); unreg_pernet: unregister_pernet_subsys(&vrf_net_ops); error: unregister_netdevice_notifier(&vrf_notifier_block); return rc; } module_init(vrf_init_module); MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern"); MODULE_DESCRIPTION("Device driver to instantiate VRF domains"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK(DRV_NAME); MODULE_VERSION(DRV_VERSION); |
4 3 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> */ #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SYNPROXY.h> #include <net/netfilter/nf_synproxy.h> static unsigned int synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; struct net *net = xt_net(par); struct synproxy_net *snet = synproxy_pernet(net); struct synproxy_options opts = {}; struct tcphdr *th, _th; if (nf_ip_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP)) return NF_DROP; th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); if (th == NULL) return NF_DROP; if (!synproxy_parse_options(skb, par->thoff, th, &opts)) return NF_DROP; if (th->syn && !(th->ack || th->fin || th->rst)) { /* Initial SYN from client */ this_cpu_inc(snet->stats->syn_received); if (th->ece && th->cwr) opts.options |= XT_SYNPROXY_OPT_ECN; opts.options &= info->options; opts.mss_encode = opts.mss_option; opts.mss_option = info->mss; if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, &opts); else opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); synproxy_send_client_synack(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) { consume_skb(skb); return NF_STOLEN; } else { return NF_DROP; } } return XT_CONTINUE; } static int synproxy_tg4_check(const struct xt_tgchk_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); const struct ipt_entry *e = par->entryinfo; int err; if (e->ip.proto != IPPROTO_TCP || e->ip.invflags & XT_INV_PROTO) return -EINVAL; err = nf_ct_netns_get(par->net, par->family); if (err) return err; err = nf_synproxy_ipv4_init(snet, par->net); if (err) { nf_ct_netns_put(par->net, par->family); return err; } return err; } static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); nf_synproxy_ipv4_fini(snet, par->net); nf_ct_netns_put(par->net, par->family); } static struct xt_target synproxy_tg4_reg __read_mostly = { .name = "SYNPROXY", .family = NFPROTO_IPV4, .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD), .target = synproxy_tg4, .targetsize = sizeof(struct xt_synproxy_info), .checkentry = synproxy_tg4_check, .destroy = synproxy_tg4_destroy, .me = THIS_MODULE, }; static int __init synproxy_tg4_init(void) { return xt_register_target(&synproxy_tg4_reg); } static void __exit synproxy_tg4_exit(void) { xt_unregister_target(&synproxy_tg4_reg); } module_init(synproxy_tg4_init); module_exit(synproxy_tg4_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Intercept TCP connections and establish them using syncookies"); |
150 150 61 61 61 44 10 25 26 10 8 38 2 8 44 8 53 53 15 15 15 37 36 37 35 1 1 4 2 7 1 1 9 13 9 14 3 17 3 6 30 20 21 1 3 8 2 24 1 1 22 8 9 11 1 10 11 5 4 1 15 15 11 11 4 1 11 11 11 21 4 24 4 15 35 12 5 11 6 8 21 21 21 44 7 41 3 20 22 44 44 4 2 2 2 22 22 22 22 4 4 4 4 41 5 41 41 5 41 40 48 1 40 18 3 21 12 15 21 13 15 15 21 6 6 6 6 6 4 3 5 1 6 15 11 1 3 1 3 1 6 9 9 8 8 3 4 19 19 6 8 13 7 6 22 21 2 21 22 22 11 3 9 42 41 3 1 2 2 16 18 3 3 3 3 3 3 2 2 2 1 2 20 1 20 16 8 10 2 8 7 7 2 4 1 15 16 16 9 8 2 3 9 1 9 9 8 4 4 4 4 3 3 1 2 2 17 14 4 2 2 2 2 1 1 2 4 17 16 16 16 8 16 12 12 2 2 2 1 1 12 11 1 1 1 1 16 14 1 3 16 1 14 1 1 16 36 1 35 36 44 44 44 3 1 1 38 36 1 14 35 35 34 1 1 34 34 33 1 33 33 1 32 1 25 22 2 2 14 9 8 2 2 2 2 3 11 9 1 13 4 15 16 16 17 18 18 26 31 31 20 20 31 30 82 5 84 1 84 71 8 1 14 18 11 3 7 18 2 2 1 1 1 1 2 34 1 12 1 8 3 1 1 4 13 5 2 7 1 1 1 1 1 1 6 5 2 2 19 2 7 7 7 4 11 2 22 22 11 11 2 19 10 3 12 3 12 3 11 3 5 2 3 1 2 1 2 2 1 2 2 2 1 2 2 2 1 1 1 1 3 1 1 2 4 4 1 2 3 1 1 1 1 1 13 1 9 9 9 73 91 2 22 134 134 90 71 71 9 9 2 7 2 7 9 15 15 15 15 89 1 5 87 87 78 9 59 24 21 20 2 9 14 4 2 1 2 2 7 3 1 3 1 3 3 1 1 1 2 127 126 127 4 89 1 2 9 1 7 1 1 1 1 1 1 2 88 126 86 45 126 2 2 5 1 4 3 3 3 3 3 1 2 2 2 2 3 6 6 7 7 7 6 7 3 3 3 2 3 6 3 1 1 10 2 10 7 5 5 4 1 4 1 5 11 11 10 11 7 5 5 5 11 5 5 6 11 11 9 2 8 8 8 8 8 8 7 8 8 6 1 6 6 6 8 8 1 1 8 7 8 6 7 7 1 1 1 1 1 2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 | // SPDX-License-Identifier: GPL-2.0-only /* binder.c * * Android IPC Subsystem * * Copyright (C) 2007-2008 Google, Inc. */ /* * Locking overview * * There are 3 main spinlocks which must be acquired in the * order shown: * * 1) proc->outer_lock : protects binder_ref * binder_proc_lock() and binder_proc_unlock() are * used to acq/rel. * 2) node->lock : protects most fields of binder_node. * binder_node_lock() and binder_node_unlock() are * used to acq/rel * 3) proc->inner_lock : protects the thread and node lists * (proc->threads, proc->waiting_threads, proc->nodes) * and all todo lists associated with the binder_proc * (proc->todo, thread->todo, proc->delivered_death and * node->async_todo), as well as thread->transaction_stack * binder_inner_proc_lock() and binder_inner_proc_unlock() * are used to acq/rel * * Any lock under procA must never be nested under any lock at the same * level or below on procB. * * Functions that require a lock held on entry indicate which lock * in the suffix of the function name: * * foo_olocked() : requires node->outer_lock * foo_nlocked() : requires node->lock * foo_ilocked() : requires proc->inner_lock * foo_oilocked(): requires proc->outer_lock and proc->inner_lock * foo_nilocked(): requires node->lock and proc->inner_lock * ... */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/fdtable.h> #include <linux/file.h> #include <linux/freezer.h> #include <linux/fs.h> #include <linux/list.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/nsproxy.h> #include <linux/poll.h> #include <linux/debugfs.h> #include <linux/rbtree.h> #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/seq_file.h> #include <linux/string.h> #include <linux/uaccess.h> #include <linux/pid_namespace.h> #include <linux/security.h> #include <linux/spinlock.h> #include <linux/ratelimit.h> #include <linux/syscalls.h> #include <linux/task_work.h> #include <linux/sizes.h> #include <linux/ktime.h> #include <uapi/linux/android/binder.h> #include <linux/cacheflush.h> #include "binder_internal.h" #include "binder_trace.h" static HLIST_HEAD(binder_deferred_list); static DEFINE_MUTEX(binder_deferred_lock); static HLIST_HEAD(binder_devices); static HLIST_HEAD(binder_procs); static DEFINE_MUTEX(binder_procs_lock); static HLIST_HEAD(binder_dead_nodes); static DEFINE_SPINLOCK(binder_dead_nodes_lock); static struct dentry *binder_debugfs_dir_entry_root; static struct dentry *binder_debugfs_dir_entry_proc; static atomic_t binder_last_id; static int proc_show(struct seq_file *m, void *unused); DEFINE_SHOW_ATTRIBUTE(proc); #define FORBIDDEN_MMAP_FLAGS (VM_WRITE) enum { BINDER_DEBUG_USER_ERROR = 1U << 0, BINDER_DEBUG_FAILED_TRANSACTION = 1U << 1, BINDER_DEBUG_DEAD_TRANSACTION = 1U << 2, BINDER_DEBUG_OPEN_CLOSE = 1U << 3, BINDER_DEBUG_DEAD_BINDER = 1U << 4, BINDER_DEBUG_DEATH_NOTIFICATION = 1U << 5, BINDER_DEBUG_READ_WRITE = 1U << 6, BINDER_DEBUG_USER_REFS = 1U << 7, BINDER_DEBUG_THREADS = 1U << 8, BINDER_DEBUG_TRANSACTION = 1U << 9, BINDER_DEBUG_TRANSACTION_COMPLETE = 1U << 10, BINDER_DEBUG_FREE_BUFFER = 1U << 11, BINDER_DEBUG_INTERNAL_REFS = 1U << 12, BINDER_DEBUG_PRIORITY_CAP = 1U << 13, BINDER_DEBUG_SPINLOCKS = 1U << 14, }; static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR | BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION; module_param_named(debug_mask, binder_debug_mask, uint, 0644); char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; module_param_named(devices, binder_devices_param, charp, 0444); static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait); static int binder_stop_on_user_error; static int binder_set_stop_on_user_error(const char *val, const struct kernel_param *kp) { int ret; ret = param_set_int(val, kp); if (binder_stop_on_user_error < 2) wake_up(&binder_user_error_wait); return ret; } module_param_call(stop_on_user_error, binder_set_stop_on_user_error, param_get_int, &binder_stop_on_user_error, 0644); static __printf(2, 3) void binder_debug(int mask, const char *format, ...) { struct va_format vaf; va_list args; if (binder_debug_mask & mask) { va_start(args, format); vaf.va = &args; vaf.fmt = format; pr_info_ratelimited("%pV", &vaf); va_end(args); } } #define binder_txn_error(x...) \ binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, x) static __printf(1, 2) void binder_user_error(const char *format, ...) { struct va_format vaf; va_list args; if (binder_debug_mask & BINDER_DEBUG_USER_ERROR) { va_start(args, format); vaf.va = &args; vaf.fmt = format; pr_info_ratelimited("%pV", &vaf); va_end(args); } if (binder_stop_on_user_error) binder_stop_on_user_error = 2; } #define binder_set_extended_error(ee, _id, _command, _param) \ do { \ (ee)->id = _id; \ (ee)->command = _command; \ (ee)->param = _param; \ } while (0) #define to_flat_binder_object(hdr) \ container_of(hdr, struct flat_binder_object, hdr) #define to_binder_fd_object(hdr) container_of(hdr, struct binder_fd_object, hdr) #define to_binder_buffer_object(hdr) \ container_of(hdr, struct binder_buffer_object, hdr) #define to_binder_fd_array_object(hdr) \ container_of(hdr, struct binder_fd_array_object, hdr) static struct binder_stats binder_stats; static inline void binder_stats_deleted(enum binder_stat_types type) { atomic_inc(&binder_stats.obj_deleted[type]); } static inline void binder_stats_created(enum binder_stat_types type) { atomic_inc(&binder_stats.obj_created[type]); } struct binder_transaction_log_entry { int debug_id; int debug_id_done; int call_type; int from_proc; int from_thread; int target_handle; int to_proc; int to_thread; int to_node; int data_size; int offsets_size; int return_error_line; uint32_t return_error; uint32_t return_error_param; char context_name[BINDERFS_MAX_NAME + 1]; }; struct binder_transaction_log { atomic_t cur; bool full; struct binder_transaction_log_entry entry[32]; }; static struct binder_transaction_log binder_transaction_log; static struct binder_transaction_log binder_transaction_log_failed; static struct binder_transaction_log_entry *binder_transaction_log_add( struct binder_transaction_log *log) { struct binder_transaction_log_entry *e; unsigned int cur = atomic_inc_return(&log->cur); if (cur >= ARRAY_SIZE(log->entry)) log->full = true; e = &log->entry[cur % ARRAY_SIZE(log->entry)]; WRITE_ONCE(e->debug_id_done, 0); /* * write-barrier to synchronize access to e->debug_id_done. * We make sure the initialized 0 value is seen before * memset() other fields are zeroed by memset. */ smp_wmb(); memset(e, 0, sizeof(*e)); return e; } enum binder_deferred_state { BINDER_DEFERRED_FLUSH = 0x01, BINDER_DEFERRED_RELEASE = 0x02, }; enum { BINDER_LOOPER_STATE_REGISTERED = 0x01, BINDER_LOOPER_STATE_ENTERED = 0x02, BINDER_LOOPER_STATE_EXITED = 0x04, BINDER_LOOPER_STATE_INVALID = 0x08, BINDER_LOOPER_STATE_WAITING = 0x10, BINDER_LOOPER_STATE_POLL = 0x20, }; /** * binder_proc_lock() - Acquire outer lock for given binder_proc * @proc: struct binder_proc to acquire * * Acquires proc->outer_lock. Used to protect binder_ref * structures associated with the given proc. */ #define binder_proc_lock(proc) _binder_proc_lock(proc, __LINE__) static void _binder_proc_lock(struct binder_proc *proc, int line) __acquires(&proc->outer_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_lock(&proc->outer_lock); } /** * binder_proc_unlock() - Release spinlock for given binder_proc * @proc: struct binder_proc to acquire * * Release lock acquired via binder_proc_lock() */ #define binder_proc_unlock(proc) _binder_proc_unlock(proc, __LINE__) static void _binder_proc_unlock(struct binder_proc *proc, int line) __releases(&proc->outer_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_unlock(&proc->outer_lock); } /** * binder_inner_proc_lock() - Acquire inner lock for given binder_proc * @proc: struct binder_proc to acquire * * Acquires proc->inner_lock. Used to protect todo lists */ #define binder_inner_proc_lock(proc) _binder_inner_proc_lock(proc, __LINE__) static void _binder_inner_proc_lock(struct binder_proc *proc, int line) __acquires(&proc->inner_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_lock(&proc->inner_lock); } /** * binder_inner_proc_unlock() - Release inner lock for given binder_proc * @proc: struct binder_proc to acquire * * Release lock acquired via binder_inner_proc_lock() */ #define binder_inner_proc_unlock(proc) _binder_inner_proc_unlock(proc, __LINE__) static void _binder_inner_proc_unlock(struct binder_proc *proc, int line) __releases(&proc->inner_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_unlock(&proc->inner_lock); } /** * binder_node_lock() - Acquire spinlock for given binder_node * @node: struct binder_node to acquire * * Acquires node->lock. Used to protect binder_node fields */ #define binder_node_lock(node) _binder_node_lock(node, __LINE__) static void _binder_node_lock(struct binder_node *node, int line) __acquires(&node->lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_lock(&node->lock); } /** * binder_node_unlock() - Release spinlock for given binder_proc * @node: struct binder_node to acquire * * Release lock acquired via binder_node_lock() */ #define binder_node_unlock(node) _binder_node_unlock(node, __LINE__) static void _binder_node_unlock(struct binder_node *node, int line) __releases(&node->lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_unlock(&node->lock); } /** * binder_node_inner_lock() - Acquire node and inner locks * @node: struct binder_node to acquire * * Acquires node->lock. If node->proc also acquires * proc->inner_lock. Used to protect binder_node fields */ #define binder_node_inner_lock(node) _binder_node_inner_lock(node, __LINE__) static void _binder_node_inner_lock(struct binder_node *node, int line) __acquires(&node->lock) __acquires(&node->proc->inner_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_lock(&node->lock); if (node->proc) binder_inner_proc_lock(node->proc); else /* annotation for sparse */ __acquire(&node->proc->inner_lock); } /** * binder_node_inner_unlock() - Release node and inner locks * @node: struct binder_node to acquire * * Release lock acquired via binder_node_lock() */ #define binder_node_inner_unlock(node) _binder_node_inner_unlock(node, __LINE__) static void _binder_node_inner_unlock(struct binder_node *node, int line) __releases(&node->lock) __releases(&node->proc->inner_lock) { struct binder_proc *proc = node->proc; binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); if (proc) binder_inner_proc_unlock(proc); else /* annotation for sparse */ __release(&node->proc->inner_lock); spin_unlock(&node->lock); } static bool binder_worklist_empty_ilocked(struct list_head *list) { return list_empty(list); } /** * binder_worklist_empty() - Check if no items on the work list * @proc: binder_proc associated with list * @list: list to check * * Return: true if there are no items on list, else false */ static bool binder_worklist_empty(struct binder_proc *proc, struct list_head *list) { bool ret; binder_inner_proc_lock(proc); ret = binder_worklist_empty_ilocked(list); binder_inner_proc_unlock(proc); return ret; } /** * binder_enqueue_work_ilocked() - Add an item to the work list * @work: struct binder_work to add to list * @target_list: list to add work to * * Adds the work to the specified list. Asserts that work * is not already on a list. * * Requires the proc->inner_lock to be held. */ static void binder_enqueue_work_ilocked(struct binder_work *work, struct list_head *target_list) { BUG_ON(target_list == NULL); BUG_ON(work->entry.next && !list_empty(&work->entry)); list_add_tail(&work->entry, target_list); } /** * binder_enqueue_deferred_thread_work_ilocked() - Add deferred thread work * @thread: thread to queue work to * @work: struct binder_work to add to list * * Adds the work to the todo list of the thread. Doesn't set the process_todo * flag, which means that (if it wasn't already set) the thread will go to * sleep without handling this work when it calls read. * * Requires the proc->inner_lock to be held. */ static void binder_enqueue_deferred_thread_work_ilocked(struct binder_thread *thread, struct binder_work *work) { WARN_ON(!list_empty(&thread->waiting_thread_node)); binder_enqueue_work_ilocked(work, &thread->todo); } /** * binder_enqueue_thread_work_ilocked() - Add an item to the thread work list * @thread: thread to queue work to * @work: struct binder_work to add to list * * Adds the work to the todo list of the thread, and enables processing * of the todo queue. * * Requires the proc->inner_lock to be held. */ static void binder_enqueue_thread_work_ilocked(struct binder_thread *thread, struct binder_work *work) { WARN_ON(!list_empty(&thread->waiting_thread_node)); binder_enqueue_work_ilocked(work, &thread->todo); /* (e)poll-based threads require an explicit wakeup signal when * queuing their own work; they rely on these events to consume * messages without I/O block. Without it, threads risk waiting * indefinitely without handling the work. */ if (thread->looper & BINDER_LOOPER_STATE_POLL && thread->pid == current->pid && !thread->process_todo) wake_up_interruptible_sync(&thread->wait); thread->process_todo = true; } /** * binder_enqueue_thread_work() - Add an item to the thread work list * @thread: thread to queue work to * @work: struct binder_work to add to list * * Adds the work to the todo list of the thread, and enables processing * of the todo queue. */ static void binder_enqueue_thread_work(struct binder_thread *thread, struct binder_work *work) { binder_inner_proc_lock(thread->proc); binder_enqueue_thread_work_ilocked(thread, work); binder_inner_proc_unlock(thread->proc); } static void binder_dequeue_work_ilocked(struct binder_work *work) { list_del_init(&work->entry); } /** * binder_dequeue_work() - Removes an item from the work list * @proc: binder_proc associated with list * @work: struct binder_work to remove from list * * Removes the specified work item from whatever list it is on. * Can safely be called if work is not on any list. */ static void binder_dequeue_work(struct binder_proc *proc, struct binder_work *work) { binder_inner_proc_lock(proc); binder_dequeue_work_ilocked(work); binder_inner_proc_unlock(proc); } static struct binder_work *binder_dequeue_work_head_ilocked( struct list_head *list) { struct binder_work *w; w = list_first_entry_or_null(list, struct binder_work, entry); if (w) list_del_init(&w->entry); return w; } static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); static void binder_free_thread(struct binder_thread *thread); static void binder_free_proc(struct binder_proc *proc); static void binder_inc_node_tmpref_ilocked(struct binder_node *node); static bool binder_has_work_ilocked(struct binder_thread *thread, bool do_proc_work) { return thread->process_todo || thread->looper_need_return || (do_proc_work && !binder_worklist_empty_ilocked(&thread->proc->todo)); } static bool binder_has_work(struct binder_thread *thread, bool do_proc_work) { bool has_work; binder_inner_proc_lock(thread->proc); has_work = binder_has_work_ilocked(thread, do_proc_work); binder_inner_proc_unlock(thread->proc); return has_work; } static bool binder_available_for_proc_work_ilocked(struct binder_thread *thread) { return !thread->transaction_stack && binder_worklist_empty_ilocked(&thread->todo) && (thread->looper & (BINDER_LOOPER_STATE_ENTERED | BINDER_LOOPER_STATE_REGISTERED)); } static void binder_wakeup_poll_threads_ilocked(struct binder_proc *proc, bool sync) { struct rb_node *n; struct binder_thread *thread; for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) { thread = rb_entry(n, struct binder_thread, rb_node); if (thread->looper & BINDER_LOOPER_STATE_POLL && binder_available_for_proc_work_ilocked(thread)) { if (sync) wake_up_interruptible_sync(&thread->wait); else wake_up_interruptible(&thread->wait); } } } /** * binder_select_thread_ilocked() - selects a thread for doing proc work. * @proc: process to select a thread from * * Note that calling this function moves the thread off the waiting_threads * list, so it can only be woken up by the caller of this function, or a * signal. Therefore, callers *should* always wake up the thread this function * returns. * * Return: If there's a thread currently waiting for process work, * returns that thread. Otherwise returns NULL. */ static struct binder_thread * binder_select_thread_ilocked(struct binder_proc *proc) { struct binder_thread *thread; assert_spin_locked(&proc->inner_lock); thread = list_first_entry_or_null(&proc->waiting_threads, struct binder_thread, waiting_thread_node); if (thread) list_del_init(&thread->waiting_thread_node); return thread; } /** * binder_wakeup_thread_ilocked() - wakes up a thread for doing proc work. * @proc: process to wake up a thread in * @thread: specific thread to wake-up (may be NULL) * @sync: whether to do a synchronous wake-up * * This function wakes up a thread in the @proc process. * The caller may provide a specific thread to wake-up in * the @thread parameter. If @thread is NULL, this function * will wake up threads that have called poll(). * * Note that for this function to work as expected, callers * should first call binder_select_thread() to find a thread * to handle the work (if they don't have a thread already), * and pass the result into the @thread parameter. */ static void binder_wakeup_thread_ilocked(struct binder_proc *proc, struct binder_thread *thread, bool sync) { assert_spin_locked(&proc->inner_lock); if (thread) { if (sync) wake_up_interruptible_sync(&thread->wait); else wake_up_interruptible(&thread->wait); return; } /* Didn't find a thread waiting for proc work; this can happen * in two scenarios: * 1. All threads are busy handling transactions * In that case, one of those threads should call back into * the kernel driver soon and pick up this work. * 2. Threads are using the (e)poll interface, in which case * they may be blocked on the waitqueue without having been * added to waiting_threads. For this case, we just iterate * over all threads not handling transaction work, and * wake them all up. We wake all because we don't know whether * a thread that called into (e)poll is handling non-binder * work currently. */ binder_wakeup_poll_threads_ilocked(proc, sync); } static void binder_wakeup_proc_ilocked(struct binder_proc *proc) { struct binder_thread *thread = binder_select_thread_ilocked(proc); binder_wakeup_thread_ilocked(proc, thread, /* sync = */false); } static void binder_set_nice(long nice) { long min_nice; if (can_nice(current, nice)) { set_user_nice(current, nice); return; } min_nice = rlimit_to_nice(rlimit(RLIMIT_NICE)); binder_debug(BINDER_DEBUG_PRIORITY_CAP, "%d: nice value %ld not allowed use %ld instead\n", current->pid, nice, min_nice); set_user_nice(current, min_nice); if (min_nice <= MAX_NICE) return; binder_user_error("%d RLIMIT_NICE not set\n", current->pid); } static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc, binder_uintptr_t ptr) { struct rb_node *n = proc->nodes.rb_node; struct binder_node *node; assert_spin_locked(&proc->inner_lock); while (n) { node = rb_entry(n, struct binder_node, rb_node); if (ptr < node->ptr) n = n->rb_left; else if (ptr > node->ptr) n = n->rb_right; else { /* * take an implicit weak reference * to ensure node stays alive until * call to binder_put_node() */ binder_inc_node_tmpref_ilocked(node); return node; } } return NULL; } static struct binder_node *binder_get_node(struct binder_proc *proc, binder_uintptr_t ptr) { struct binder_node *node; binder_inner_proc_lock(proc); node = binder_get_node_ilocked(proc, ptr); binder_inner_proc_unlock(proc); return node; } static struct binder_node *binder_init_node_ilocked( struct binder_proc *proc, struct binder_node *new_node, struct flat_binder_object *fp) { struct rb_node **p = &proc->nodes.rb_node; struct rb_node *parent = NULL; struct binder_node *node; binder_uintptr_t ptr = fp ? fp->binder : 0; binder_uintptr_t cookie = fp ? fp->cookie : 0; __u32 flags = fp ? fp->flags : 0; assert_spin_locked(&proc->inner_lock); while (*p) { parent = *p; node = rb_entry(parent, struct binder_node, rb_node); if (ptr < node->ptr) p = &(*p)->rb_left; else if (ptr > node->ptr) p = &(*p)->rb_right; else { /* * A matching node is already in * the rb tree. Abandon the init * and return it. */ binder_inc_node_tmpref_ilocked(node); return node; } } node = new_node; binder_stats_created(BINDER_STAT_NODE); node->tmp_refs++; rb_link_node(&node->rb_node, parent, p); rb_insert_color(&node->rb_node, &proc->nodes); node->debug_id = atomic_inc_return(&binder_last_id); node->proc = proc; node->ptr = ptr; node->cookie = cookie; node->work.type = BINDER_WORK_NODE; node->min_priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK; node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); node->txn_security_ctx = !!(flags & FLAT_BINDER_FLAG_TXN_SECURITY_CTX); spin_lock_init(&node->lock); INIT_LIST_HEAD(&node->work.entry); INIT_LIST_HEAD(&node->async_todo); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d:%d node %d u%016llx c%016llx created\n", proc->pid, current->pid, node->debug_id, (u64)node->ptr, (u64)node->cookie); return node; } static struct binder_node *binder_new_node(struct binder_proc *proc, struct flat_binder_object *fp) { struct binder_node *node; struct binder_node *new_node = kzalloc(sizeof(*node), GFP_KERNEL); if (!new_node) return NULL; binder_inner_proc_lock(proc); node = binder_init_node_ilocked(proc, new_node, fp); binder_inner_proc_unlock(proc); if (node != new_node) /* * The node was already added by another thread */ kfree(new_node); return node; } static void binder_free_node(struct binder_node *node) { kfree(node); binder_stats_deleted(BINDER_STAT_NODE); } static int binder_inc_node_nilocked(struct binder_node *node, int strong, int internal, struct list_head *target_list) { struct binder_proc *proc = node->proc; assert_spin_locked(&node->lock); if (proc) assert_spin_locked(&proc->inner_lock); if (strong) { if (internal) { if (target_list == NULL && node->internal_strong_refs == 0 && !(node->proc && node == node->proc->context->binder_context_mgr_node && node->has_strong_ref)) { pr_err("invalid inc strong node for %d\n", node->debug_id); return -EINVAL; } node->internal_strong_refs++; } else node->local_strong_refs++; if (!node->has_strong_ref && target_list) { struct binder_thread *thread = container_of(target_list, struct binder_thread, todo); binder_dequeue_work_ilocked(&node->work); BUG_ON(&thread->todo != target_list); binder_enqueue_deferred_thread_work_ilocked(thread, &node->work); } } else { if (!internal) node->local_weak_refs++; if (!node->has_weak_ref && list_empty(&node->work.entry)) { if (target_list == NULL) { pr_err("invalid inc weak node for %d\n", node->debug_id); return -EINVAL; } /* * See comment above */ binder_enqueue_work_ilocked(&node->work, target_list); } } return 0; } static int binder_inc_node(struct binder_node *node, int strong, int internal, struct list_head *target_list) { int ret; binder_node_inner_lock(node); ret = binder_inc_node_nilocked(node, strong, internal, target_list); binder_node_inner_unlock(node); return ret; } static bool binder_dec_node_nilocked(struct binder_node *node, int strong, int internal) { struct binder_proc *proc = node->proc; assert_spin_locked(&node->lock); if (proc) assert_spin_locked(&proc->inner_lock); if (strong) { if (internal) node->internal_strong_refs--; else node->local_strong_refs--; if (node->local_strong_refs || node->internal_strong_refs) return false; } else { if (!internal) node->local_weak_refs--; if (node->local_weak_refs || node->tmp_refs || !hlist_empty(&node->refs)) return false; } if (proc && (node->has_strong_ref || node->has_weak_ref)) { if (list_empty(&node->work.entry)) { binder_enqueue_work_ilocked(&node->work, &proc->todo); binder_wakeup_proc_ilocked(proc); } } else { if (hlist_empty(&node->refs) && !node->local_strong_refs && !node->local_weak_refs && !node->tmp_refs) { if (proc) { binder_dequeue_work_ilocked(&node->work); rb_erase(&node->rb_node, &proc->nodes); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "refless node %d deleted\n", node->debug_id); } else { BUG_ON(!list_empty(&node->work.entry)); spin_lock(&binder_dead_nodes_lock); /* * tmp_refs could have changed so * check it again */ if (node->tmp_refs) { spin_unlock(&binder_dead_nodes_lock); return false; } hlist_del(&node->dead_node); spin_unlock(&binder_dead_nodes_lock); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "dead node %d deleted\n", node->debug_id); } return true; } } return false; } static void binder_dec_node(struct binder_node *node, int strong, int internal) { bool free_node; binder_node_inner_lock(node); free_node = binder_dec_node_nilocked(node, strong, internal); binder_node_inner_unlock(node); if (free_node) binder_free_node(node); } static void binder_inc_node_tmpref_ilocked(struct binder_node *node) { /* * No call to binder_inc_node() is needed since we * don't need to inform userspace of any changes to * tmp_refs */ node->tmp_refs++; } /** * binder_inc_node_tmpref() - take a temporary reference on node * @node: node to reference * * Take reference on node to prevent the node from being freed * while referenced only by a local variable. The inner lock is * needed to serialize with the node work on the queue (which * isn't needed after the node is dead). If the node is dead * (node->proc is NULL), use binder_dead_nodes_lock to protect * node->tmp_refs against dead-node-only cases where the node * lock cannot be acquired (eg traversing the dead node list to * print nodes) */ static void binder_inc_node_tmpref(struct binder_node *node) { binder_node_lock(node); if (node->proc) binder_inner_proc_lock(node->proc); else spin_lock(&binder_dead_nodes_lock); binder_inc_node_tmpref_ilocked(node); if (node->proc) binder_inner_proc_unlock(node->proc); else spin_unlock(&binder_dead_nodes_lock); binder_node_unlock(node); } /** * binder_dec_node_tmpref() - remove a temporary reference on node * @node: node to reference * * Release temporary reference on node taken via binder_inc_node_tmpref() */ static void binder_dec_node_tmpref(struct binder_node *node) { bool free_node; binder_node_inner_lock(node); if (!node->proc) spin_lock(&binder_dead_nodes_lock); else __acquire(&binder_dead_nodes_lock); node->tmp_refs--; BUG_ON(node->tmp_refs < 0); if (!node->proc) spin_unlock(&binder_dead_nodes_lock); else __release(&binder_dead_nodes_lock); /* * Call binder_dec_node() to check if all refcounts are 0 * and cleanup is needed. Calling with strong=0 and internal=1 * causes no actual reference to be released in binder_dec_node(). * If that changes, a change is needed here too. */ free_node = binder_dec_node_nilocked(node, 0, 1); binder_node_inner_unlock(node); if (free_node) binder_free_node(node); } static void binder_put_node(struct binder_node *node) { binder_dec_node_tmpref(node); } static struct binder_ref *binder_get_ref_olocked(struct binder_proc *proc, u32 desc, bool need_strong_ref) { struct rb_node *n = proc->refs_by_desc.rb_node; struct binder_ref *ref; while (n) { ref = rb_entry(n, struct binder_ref, rb_node_desc); if (desc < ref->data.desc) { n = n->rb_left; } else if (desc > ref->data.desc) { n = n->rb_right; } else if (need_strong_ref && !ref->data.strong) { binder_user_error("tried to use weak ref as strong ref\n"); return NULL; } else { return ref; } } return NULL; } /** * binder_get_ref_for_node_olocked() - get the ref associated with given node * @proc: binder_proc that owns the ref * @node: binder_node of target * @new_ref: newly allocated binder_ref to be initialized or %NULL * * Look up the ref for the given node and return it if it exists * * If it doesn't exist and the caller provides a newly allocated * ref, initialize the fields of the newly allocated ref and insert * into the given proc rb_trees and node refs list. * * Return: the ref for node. It is possible that another thread * allocated/initialized the ref first in which case the * returned ref would be different than the passed-in * new_ref. new_ref must be kfree'd by the caller in * this case. */ static struct binder_ref *binder_get_ref_for_node_olocked( struct binder_proc *proc, struct binder_node *node, struct binder_ref *new_ref) { struct binder_context *context = proc->context; struct rb_node **p = &proc->refs_by_node.rb_node; struct rb_node *parent = NULL; struct binder_ref *ref; struct rb_node *n; while (*p) { parent = *p; ref = rb_entry(parent, struct binder_ref, rb_node_node); if (node < ref->node) p = &(*p)->rb_left; else if (node > ref->node) p = &(*p)->rb_right; else return ref; } if (!new_ref) return NULL; binder_stats_created(BINDER_STAT_REF); new_ref->data.debug_id = atomic_inc_return(&binder_last_id); new_ref->proc = proc; new_ref->node = node; rb_link_node(&new_ref->rb_node_node, parent, p); rb_insert_color(&new_ref->rb_node_node, &proc->refs_by_node); new_ref->data.desc = (node == context->binder_context_mgr_node) ? 0 : 1; for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) { ref = rb_entry(n, struct binder_ref, rb_node_desc); if (ref->data.desc > new_ref->data.desc) break; new_ref->data.desc = ref->data.desc + 1; } p = &proc->refs_by_desc.rb_node; while (*p) { parent = *p; ref = rb_entry(parent, struct binder_ref, rb_node_desc); if (new_ref->data.desc < ref->data.desc) p = &(*p)->rb_left; else if (new_ref->data.desc > ref->data.desc) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new_ref->rb_node_desc, parent, p); rb_insert_color(&new_ref->rb_node_desc, &proc->refs_by_desc); binder_node_lock(node); hlist_add_head(&new_ref->node_entry, &node->refs); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d new ref %d desc %d for node %d\n", proc->pid, new_ref->data.debug_id, new_ref->data.desc, node->debug_id); binder_node_unlock(node); return new_ref; } static void binder_cleanup_ref_olocked(struct binder_ref *ref) { bool delete_node = false; binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d delete ref %d desc %d for node %d\n", ref->proc->pid, ref->data.debug_id, ref->data.desc, ref->node->debug_id); rb_erase(&ref->rb_node_desc, &ref->proc->refs_by_desc); rb_erase(&ref->rb_node_node, &ref->proc->refs_by_node); binder_node_inner_lock(ref->node); if (ref->data.strong) binder_dec_node_nilocked(ref->node, 1, 1); hlist_del(&ref->node_entry); delete_node = binder_dec_node_nilocked(ref->node, 0, 1); binder_node_inner_unlock(ref->node); /* * Clear ref->node unless we want the caller to free the node */ if (!delete_node) { /* * The caller uses ref->node to determine * whether the node needs to be freed. Clear * it since the node is still alive. */ ref->node = NULL; } if (ref->death) { binder_debug(BINDER_DEBUG_DEAD_BINDER, "%d delete ref %d desc %d has death notification\n", ref->proc->pid, ref->data.debug_id, ref->data.desc); binder_dequeue_work(ref->proc, &ref->death->work); binder_stats_deleted(BINDER_STAT_DEATH); } binder_stats_deleted(BINDER_STAT_REF); } /** * binder_inc_ref_olocked() - increment the ref for given handle * @ref: ref to be incremented * @strong: if true, strong increment, else weak * @target_list: list to queue node work on * * Increment the ref. @ref->proc->outer_lock must be held on entry * * Return: 0, if successful, else errno */ static int binder_inc_ref_olocked(struct binder_ref *ref, int strong, struct list_head *target_list) { int ret; if (strong) { if (ref->data.strong == 0) { ret = binder_inc_node(ref->node, 1, 1, target_list); if (ret) return ret; } ref->data.strong++; } else { if (ref->data.weak == 0) { ret = binder_inc_node(ref->node, 0, 1, target_list); if (ret) return ret; } ref->data.weak++; } return 0; } /** * binder_dec_ref_olocked() - dec the ref for given handle * @ref: ref to be decremented * @strong: if true, strong decrement, else weak * * Decrement the ref. * * Return: %true if ref is cleaned up and ready to be freed. */ static bool binder_dec_ref_olocked(struct binder_ref *ref, int strong) { if (strong) { if (ref->data.strong == 0) { binder_user_error("%d invalid dec strong, ref %d desc %d s %d w %d\n", ref->proc->pid, ref->data.debug_id, ref->data.desc, ref->data.strong, ref->data.weak); return false; } ref->data.strong--; if (ref->data.strong == 0) binder_dec_node(ref->node, strong, 1); } else { if (ref->data.weak == 0) { binder_user_error("%d invalid dec weak, ref %d desc %d s %d w %d\n", ref->proc->pid, ref->data.debug_id, ref->data.desc, ref->data.strong, ref->data.weak); return false; } ref->data.weak--; } if (ref->data.strong == 0 && ref->data.weak == 0) { binder_cleanup_ref_olocked(ref); return true; } return false; } /** * binder_get_node_from_ref() - get the node from the given proc/desc * @proc: proc containing the ref * @desc: the handle associated with the ref * @need_strong_ref: if true, only return node if ref is strong * @rdata: the id/refcount data for the ref * * Given a proc and ref handle, return the associated binder_node * * Return: a binder_node or NULL if not found or not strong when strong required */ static struct binder_node *binder_get_node_from_ref( struct binder_proc *proc, u32 desc, bool need_strong_ref, struct binder_ref_data *rdata) { struct binder_node *node; struct binder_ref *ref; binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, desc, need_strong_ref); if (!ref) goto err_no_ref; node = ref->node; /* * Take an implicit reference on the node to ensure * it stays alive until the call to binder_put_node() */ binder_inc_node_tmpref(node); if (rdata) *rdata = ref->data; binder_proc_unlock(proc); return node; err_no_ref: binder_proc_unlock(proc); return NULL; } /** * binder_free_ref() - free the binder_ref * @ref: ref to free * * Free the binder_ref. Free the binder_node indicated by ref->node * (if non-NULL) and the binder_ref_death indicated by ref->death. */ static void binder_free_ref(struct binder_ref *ref) { if (ref->node) binder_free_node(ref->node); kfree(ref->death); kfree(ref); } /** * binder_update_ref_for_handle() - inc/dec the ref for given handle * @proc: proc containing the ref * @desc: the handle associated with the ref * @increment: true=inc reference, false=dec reference * @strong: true=strong reference, false=weak reference * @rdata: the id/refcount data for the ref * * Given a proc and ref handle, increment or decrement the ref * according to "increment" arg. * * Return: 0 if successful, else errno */ static int binder_update_ref_for_handle(struct binder_proc *proc, uint32_t desc, bool increment, bool strong, struct binder_ref_data *rdata) { int ret = 0; struct binder_ref *ref; bool delete_ref = false; binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, desc, strong); if (!ref) { ret = -EINVAL; goto err_no_ref; } if (increment) ret = binder_inc_ref_olocked(ref, strong, NULL); else delete_ref = binder_dec_ref_olocked(ref, strong); if (rdata) *rdata = ref->data; binder_proc_unlock(proc); if (delete_ref) binder_free_ref(ref); return ret; err_no_ref: binder_proc_unlock(proc); return ret; } /** * binder_dec_ref_for_handle() - dec the ref for given handle * @proc: proc containing the ref * @desc: the handle associated with the ref * @strong: true=strong reference, false=weak reference * @rdata: the id/refcount data for the ref * * Just calls binder_update_ref_for_handle() to decrement the ref. * * Return: 0 if successful, else errno */ static int binder_dec_ref_for_handle(struct binder_proc *proc, uint32_t desc, bool strong, struct binder_ref_data *rdata) { return binder_update_ref_for_handle(proc, desc, false, strong, rdata); } /** * binder_inc_ref_for_node() - increment the ref for given proc/node * @proc: proc containing the ref * @node: target node * @strong: true=strong reference, false=weak reference * @target_list: worklist to use if node is incremented * @rdata: the id/refcount data for the ref * * Given a proc and node, increment the ref. Create the ref if it * doesn't already exist * * Return: 0 if successful, else errno */ static int binder_inc_ref_for_node(struct binder_proc *proc, struct binder_node *node, bool strong, struct list_head *target_list, struct binder_ref_data *rdata) { struct binder_ref *ref; struct binder_ref *new_ref = NULL; int ret = 0; binder_proc_lock(proc); ref = binder_get_ref_for_node_olocked(proc, node, NULL); if (!ref) { binder_proc_unlock(proc); new_ref = kzalloc(sizeof(*ref), GFP_KERNEL); if (!new_ref) return -ENOMEM; binder_proc_lock(proc); ref = binder_get_ref_for_node_olocked(proc, node, new_ref); } ret = binder_inc_ref_olocked(ref, strong, target_list); *rdata = ref->data; if (ret && ref == new_ref) { /* * Cleanup the failed reference here as the target * could now be dead and have already released its * references by now. Calling on the new reference * with strong=0 and a tmp_refs will not decrement * the node. The new_ref gets kfree'd below. */ binder_cleanup_ref_olocked(new_ref); ref = NULL; } binder_proc_unlock(proc); if (new_ref && ref != new_ref) /* * Another thread created the ref first so * free the one we allocated */ kfree(new_ref); return ret; } static void binder_pop_transaction_ilocked(struct binder_thread *target_thread, struct binder_transaction *t) { BUG_ON(!target_thread); assert_spin_locked(&target_thread->proc->inner_lock); BUG_ON(target_thread->transaction_stack != t); BUG_ON(target_thread->transaction_stack->from != target_thread); target_thread->transaction_stack = target_thread->transaction_stack->from_parent; t->from = NULL; } /** * binder_thread_dec_tmpref() - decrement thread->tmp_ref * @thread: thread to decrement * * A thread needs to be kept alive while being used to create or * handle a transaction. binder_get_txn_from() is used to safely * extract t->from from a binder_transaction and keep the thread * indicated by t->from from being freed. When done with that * binder_thread, this function is called to decrement the * tmp_ref and free if appropriate (thread has been released * and no transaction being processed by the driver) */ static void binder_thread_dec_tmpref(struct binder_thread *thread) { /* * atomic is used to protect the counter value while * it cannot reach zero or thread->is_dead is false */ binder_inner_proc_lock(thread->proc); atomic_dec(&thread->tmp_ref); if (thread->is_dead && !atomic_read(&thread->tmp_ref)) { binder_inner_proc_unlock(thread->proc); binder_free_thread(thread); return; } binder_inner_proc_unlock(thread->proc); } /** * binder_proc_dec_tmpref() - decrement proc->tmp_ref * @proc: proc to decrement * * A binder_proc needs to be kept alive while being used to create or * handle a transaction. proc->tmp_ref is incremented when * creating a new transaction or the binder_proc is currently in-use * by threads that are being released. When done with the binder_proc, * this function is called to decrement the counter and free the * proc if appropriate (proc has been released, all threads have * been released and not currenly in-use to process a transaction). */ static void binder_proc_dec_tmpref(struct binder_proc *proc) { binder_inner_proc_lock(proc); proc->tmp_ref--; if (proc->is_dead && RB_EMPTY_ROOT(&proc->threads) && !proc->tmp_ref) { binder_inner_proc_unlock(proc); binder_free_proc(proc); return; } binder_inner_proc_unlock(proc); } /** * binder_get_txn_from() - safely extract the "from" thread in transaction * @t: binder transaction for t->from * * Atomically return the "from" thread and increment the tmp_ref * count for the thread to ensure it stays alive until * binder_thread_dec_tmpref() is called. * * Return: the value of t->from */ static struct binder_thread *binder_get_txn_from( struct binder_transaction *t) { struct binder_thread *from; spin_lock(&t->lock); from = t->from; if (from) atomic_inc(&from->tmp_ref); spin_unlock(&t->lock); return from; } /** * binder_get_txn_from_and_acq_inner() - get t->from and acquire inner lock * @t: binder transaction for t->from * * Same as binder_get_txn_from() except it also acquires the proc->inner_lock * to guarantee that the thread cannot be released while operating on it. * The caller must call binder_inner_proc_unlock() to release the inner lock * as well as call binder_dec_thread_txn() to release the reference. * * Return: the value of t->from */ static struct binder_thread *binder_get_txn_from_and_acq_inner( struct binder_transaction *t) __acquires(&t->from->proc->inner_lock) { struct binder_thread *from; from = binder_get_txn_from(t); if (!from) { __acquire(&from->proc->inner_lock); return NULL; } binder_inner_proc_lock(from->proc); if (t->from) { BUG_ON(from != t->from); return from; } binder_inner_proc_unlock(from->proc); __acquire(&from->proc->inner_lock); binder_thread_dec_tmpref(from); return NULL; } /** * binder_free_txn_fixups() - free unprocessed fd fixups * @t: binder transaction for t->from * * If the transaction is being torn down prior to being * processed by the target process, free all of the * fd fixups and fput the file structs. It is safe to * call this function after the fixups have been * processed -- in that case, the list will be empty. */ static void binder_free_txn_fixups(struct binder_transaction *t) { struct binder_txn_fd_fixup *fixup, *tmp; list_for_each_entry_safe(fixup, tmp, &t->fd_fixups, fixup_entry) { fput(fixup->file); if (fixup->target_fd >= 0) put_unused_fd(fixup->target_fd); list_del(&fixup->fixup_entry); kfree(fixup); } } static void binder_txn_latency_free(struct binder_transaction *t) { int from_proc, from_thread, to_proc, to_thread; spin_lock(&t->lock); from_proc = t->from ? t->from->proc->pid : 0; from_thread = t->from ? t->from->pid : 0; to_proc = t->to_proc ? t->to_proc->pid : 0; to_thread = t->to_thread ? t->to_thread->pid : 0; spin_unlock(&t->lock); trace_binder_txn_latency_free(t, from_proc, from_thread, to_proc, to_thread); } static void binder_free_transaction(struct binder_transaction *t) { struct binder_proc *target_proc = t->to_proc; if (target_proc) { binder_inner_proc_lock(target_proc); target_proc->outstanding_txns--; if (target_proc->outstanding_txns < 0) pr_warn("%s: Unexpected outstanding_txns %d\n", __func__, target_proc->outstanding_txns); if (!target_proc->outstanding_txns && target_proc->is_frozen) wake_up_interruptible_all(&target_proc->freeze_wait); if (t->buffer) t->buffer->transaction = NULL; binder_inner_proc_unlock(target_proc); } if (trace_binder_txn_latency_free_enabled()) binder_txn_latency_free(t); /* * If the transaction has no target_proc, then * t->buffer->transaction has already been cleared. */ binder_free_txn_fixups(t); kfree(t); binder_stats_deleted(BINDER_STAT_TRANSACTION); } static void binder_send_failed_reply(struct binder_transaction *t, uint32_t error_code) { struct binder_thread *target_thread; struct binder_transaction *next; BUG_ON(t->flags & TF_ONE_WAY); while (1) { target_thread = binder_get_txn_from_and_acq_inner(t); if (target_thread) { binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "send failed reply for transaction %d to %d:%d\n", t->debug_id, target_thread->proc->pid, target_thread->pid); binder_pop_transaction_ilocked(target_thread, t); if (target_thread->reply_error.cmd == BR_OK) { target_thread->reply_error.cmd = error_code; binder_enqueue_thread_work_ilocked( target_thread, &target_thread->reply_error.work); wake_up_interruptible(&target_thread->wait); } else { /* * Cannot get here for normal operation, but * we can if multiple synchronous transactions * are sent without blocking for responses. * Just ignore the 2nd error in this case. */ pr_warn("Unexpected reply error: %u\n", target_thread->reply_error.cmd); } binder_inner_proc_unlock(target_thread->proc); binder_thread_dec_tmpref(target_thread); binder_free_transaction(t); return; } __release(&target_thread->proc->inner_lock); next = t->from_parent; binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "send failed reply for transaction %d, target dead\n", t->debug_id); binder_free_transaction(t); if (next == NULL) { binder_debug(BINDER_DEBUG_DEAD_BINDER, "reply failed, no target thread at root\n"); return; } t = next; binder_debug(BINDER_DEBUG_DEAD_BINDER, "reply failed, no target thread -- retry %d\n", t->debug_id); } } /** * binder_cleanup_transaction() - cleans up undelivered transaction * @t: transaction that needs to be cleaned up * @reason: reason the transaction wasn't delivered * @error_code: error to return to caller (if synchronous call) */ static void binder_cleanup_transaction(struct binder_transaction *t, const char *reason, uint32_t error_code) { if (t->buffer->target_node && !(t->flags & TF_ONE_WAY)) { binder_send_failed_reply(t, error_code); } else { binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered transaction %d, %s\n", t->debug_id, reason); binder_free_transaction(t); } } /** * binder_get_object() - gets object and checks for valid metadata * @proc: binder_proc owning the buffer * @u: sender's user pointer to base of buffer * @buffer: binder_buffer that we're parsing. * @offset: offset in the @buffer at which to validate an object. * @object: struct binder_object to read into * * Copy the binder object at the given offset into @object. If @u is * provided then the copy is from the sender's buffer. If not, then * it is copied from the target's @buffer. * * Return: If there's a valid metadata object at @offset, the * size of that object. Otherwise, it returns zero. The object * is read into the struct binder_object pointed to by @object. */ static size_t binder_get_object(struct binder_proc *proc, const void __user *u, struct binder_buffer *buffer, unsigned long offset, struct binder_object *object) { size_t read_size; struct binder_object_header *hdr; size_t object_size = 0; read_size = min_t(size_t, sizeof(*object), buffer->data_size - offset); if (offset > buffer->data_size || read_size < sizeof(*hdr) || !IS_ALIGNED(offset, sizeof(u32))) return 0; if (u) { if (copy_from_user(object, u + offset, read_size)) return 0; } else { if (binder_alloc_copy_from_buffer(&proc->alloc, object, buffer, offset, read_size)) return 0; } /* Ok, now see if we read a complete object. */ hdr = &object->hdr; switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: object_size = sizeof(struct flat_binder_object); break; case BINDER_TYPE_FD: object_size = sizeof(struct binder_fd_object); break; case BINDER_TYPE_PTR: object_size = sizeof(struct binder_buffer_object); break; case BINDER_TYPE_FDA: object_size = sizeof(struct binder_fd_array_object); break; default: return 0; } if (offset <= buffer->data_size - object_size && buffer->data_size >= object_size) return object_size; else return 0; } /** * binder_validate_ptr() - validates binder_buffer_object in a binder_buffer. * @proc: binder_proc owning the buffer * @b: binder_buffer containing the object * @object: struct binder_object to read into * @index: index in offset array at which the binder_buffer_object is * located * @start_offset: points to the start of the offset array * @object_offsetp: offset of @object read from @b * @num_valid: the number of valid offsets in the offset array * * Return: If @index is within the valid range of the offset array * described by @start and @num_valid, and if there's a valid * binder_buffer_object at the offset found in index @index * of the offset array, that object is returned. Otherwise, * %NULL is returned. * Note that the offset found in index @index itself is not * verified; this function assumes that @num_valid elements * from @start were previously verified to have valid offsets. * If @object_offsetp is non-NULL, then the offset within * @b is written to it. */ static struct binder_buffer_object *binder_validate_ptr( struct binder_proc *proc, struct binder_buffer *b, struct binder_object *object, binder_size_t index, binder_size_t start_offset, binder_size_t *object_offsetp, binder_size_t num_valid) { size_t object_size; binder_size_t object_offset; unsigned long buffer_offset; if (index >= num_valid) return NULL; buffer_offset = start_offset + sizeof(binder_size_t) * index; if (binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, b, buffer_offset, sizeof(object_offset))) return NULL; object_size = binder_get_object(proc, NULL, b, object_offset, object); if (!object_size || object->hdr.type != BINDER_TYPE_PTR) return NULL; if (object_offsetp) *object_offsetp = object_offset; return &object->bbo; } /** * binder_validate_fixup() - validates pointer/fd fixups happen in order. * @proc: binder_proc owning the buffer * @b: transaction buffer * @objects_start_offset: offset to start of objects buffer * @buffer_obj_offset: offset to binder_buffer_object in which to fix up * @fixup_offset: start offset in @buffer to fix up * @last_obj_offset: offset to last binder_buffer_object that we fixed * @last_min_offset: minimum fixup offset in object at @last_obj_offset * * Return: %true if a fixup in buffer @buffer at offset @offset is * allowed. * * For safety reasons, we only allow fixups inside a buffer to happen * at increasing offsets; additionally, we only allow fixup on the last * buffer object that was verified, or one of its parents. * * Example of what is allowed: * * A * B (parent = A, offset = 0) * C (parent = A, offset = 16) * D (parent = C, offset = 0) * E (parent = A, offset = 32) // min_offset is 16 (C.parent_offset) * * Examples of what is not allowed: * * Decreasing offsets within the same parent: * A * C (parent = A, offset = 16) * B (parent = A, offset = 0) // decreasing offset within A * * Referring to a parent that wasn't the last object or any of its parents: * A * B (parent = A, offset = 0) * C (parent = A, offset = 0) * C (parent = A, offset = 16) * D (parent = B, offset = 0) // B is not A or any of A's parents */ static bool binder_validate_fixup(struct binder_proc *proc, struct binder_buffer *b, binder_size_t objects_start_offset, binder_size_t buffer_obj_offset, binder_size_t fixup_offset, binder_size_t last_obj_offset, binder_size_t last_min_offset) { if (!last_obj_offset) { /* Nothing to fix up in */ return false; } while (last_obj_offset != buffer_obj_offset) { unsigned long buffer_offset; struct binder_object last_object; struct binder_buffer_object *last_bbo; size_t object_size = binder_get_object(proc, NULL, b, last_obj_offset, &last_object); if (object_size != sizeof(*last_bbo)) return false; last_bbo = &last_object.bbo; /* * Safe to retrieve the parent of last_obj, since it * was already previously verified by the driver. */ if ((last_bbo->flags & BINDER_BUFFER_FLAG_HAS_PARENT) == 0) return false; last_min_offset = last_bbo->parent_offset + sizeof(uintptr_t); buffer_offset = objects_start_offset + sizeof(binder_size_t) * last_bbo->parent; if (binder_alloc_copy_from_buffer(&proc->alloc, &last_obj_offset, b, buffer_offset, sizeof(last_obj_offset))) return false; } return (fixup_offset >= last_min_offset); } /** * struct binder_task_work_cb - for deferred close * * @twork: callback_head for task work * @fd: fd to close * * Structure to pass task work to be handled after * returning from binder_ioctl() via task_work_add(). */ struct binder_task_work_cb { struct callback_head twork; struct file *file; }; /** * binder_do_fd_close() - close list of file descriptors * @twork: callback head for task work * * It is not safe to call ksys_close() during the binder_ioctl() * function if there is a chance that binder's own file descriptor * might be closed. This is to meet the requirements for using * fdget() (see comments for __fget_light()). Therefore use * task_work_add() to schedule the close operation once we have * returned from binder_ioctl(). This function is a callback * for that mechanism and does the actual ksys_close() on the * given file descriptor. */ static void binder_do_fd_close(struct callback_head *twork) { struct binder_task_work_cb *twcb = container_of(twork, struct binder_task_work_cb, twork); fput(twcb->file); kfree(twcb); } /** * binder_deferred_fd_close() - schedule a close for the given file-descriptor * @fd: file-descriptor to close * * See comments in binder_do_fd_close(). This function is used to schedule * a file-descriptor to be closed after returning from binder_ioctl(). */ static void binder_deferred_fd_close(int fd) { struct binder_task_work_cb *twcb; twcb = kzalloc(sizeof(*twcb), GFP_KERNEL); if (!twcb) return; init_task_work(&twcb->twork, binder_do_fd_close); twcb->file = file_close_fd(fd); if (twcb->file) { // pin it until binder_do_fd_close(); see comments there get_file(twcb->file); filp_close(twcb->file, current->files); task_work_add(current, &twcb->twork, TWA_RESUME); } else { kfree(twcb); } } static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_thread *thread, struct binder_buffer *buffer, binder_size_t off_end_offset, bool is_failure) { int debug_id = buffer->debug_id; binder_size_t off_start_offset, buffer_offset; binder_debug(BINDER_DEBUG_TRANSACTION, "%d buffer release %d, size %zd-%zd, failed at %llx\n", proc->pid, buffer->debug_id, buffer->data_size, buffer->offsets_size, (unsigned long long)off_end_offset); if (buffer->target_node) binder_dec_node(buffer->target_node, 1, 0); off_start_offset = ALIGN(buffer->data_size, sizeof(void *)); for (buffer_offset = off_start_offset; buffer_offset < off_end_offset; buffer_offset += sizeof(binder_size_t)) { struct binder_object_header *hdr; size_t object_size = 0; struct binder_object object; binder_size_t object_offset; if (!binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, buffer, buffer_offset, sizeof(object_offset))) object_size = binder_get_object(proc, NULL, buffer, object_offset, &object); if (object_size == 0) { pr_err("transaction release %d bad object at offset %lld, size %zd\n", debug_id, (u64)object_offset, buffer->data_size); continue; } hdr = &object.hdr; switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { struct flat_binder_object *fp; struct binder_node *node; fp = to_flat_binder_object(hdr); node = binder_get_node(proc, fp->binder); if (node == NULL) { pr_err("transaction release %d bad node %016llx\n", debug_id, (u64)fp->binder); break; } binder_debug(BINDER_DEBUG_TRANSACTION, " node %d u%016llx\n", node->debug_id, (u64)node->ptr); binder_dec_node(node, hdr->type == BINDER_TYPE_BINDER, 0); binder_put_node(node); } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { struct flat_binder_object *fp; struct binder_ref_data rdata; int ret; fp = to_flat_binder_object(hdr); ret = binder_dec_ref_for_handle(proc, fp->handle, hdr->type == BINDER_TYPE_HANDLE, &rdata); if (ret) { pr_err("transaction release %d bad handle %d, ret = %d\n", debug_id, fp->handle, ret); break; } binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d\n", rdata.debug_id, rdata.desc); } break; case BINDER_TYPE_FD: { /* * No need to close the file here since user-space * closes it for successfully delivered * transactions. For transactions that weren't * delivered, the new fd was never allocated so * there is no need to close and the fput on the * file is done when the transaction is torn * down. */ } break; case BINDER_TYPE_PTR: /* * Nothing to do here, this will get cleaned up when the * transaction buffer gets freed */ break; case BINDER_TYPE_FDA: { struct binder_fd_array_object *fda; struct binder_buffer_object *parent; struct binder_object ptr_object; binder_size_t fda_offset; size_t fd_index; binder_size_t fd_buf_size; binder_size_t num_valid; if (is_failure) { /* * The fd fixups have not been applied so no * fds need to be closed. */ continue; } num_valid = (buffer_offset - off_start_offset) / sizeof(binder_size_t); fda = to_binder_fd_array_object(hdr); parent = binder_validate_ptr(proc, buffer, &ptr_object, fda->parent, off_start_offset, NULL, num_valid); if (!parent) { pr_err("transaction release %d bad parent offset\n", debug_id); continue; } fd_buf_size = sizeof(u32) * fda->num_fds; if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { pr_err("transaction release %d invalid number of fds (%lld)\n", debug_id, (u64)fda->num_fds); continue; } if (fd_buf_size > parent->length || fda->parent_offset > parent->length - fd_buf_size) { /* No space for all file descriptors here. */ pr_err("transaction release %d not enough space for %lld fds in buffer\n", debug_id, (u64)fda->num_fds); continue; } /* * the source data for binder_buffer_object is visible * to user-space and the @buffer element is the user * pointer to the buffer_object containing the fd_array. * Convert the address to an offset relative to * the base of the transaction buffer. */ fda_offset = parent->buffer - buffer->user_data + fda->parent_offset; for (fd_index = 0; fd_index < fda->num_fds; fd_index++) { u32 fd; int err; binder_size_t offset = fda_offset + fd_index * sizeof(fd); err = binder_alloc_copy_from_buffer( &proc->alloc, &fd, buffer, offset, sizeof(fd)); WARN_ON(err); if (!err) { binder_deferred_fd_close(fd); /* * Need to make sure the thread goes * back to userspace to complete the * deferred close */ if (thread) thread->looper_need_return = true; } } } break; default: pr_err("transaction release %d bad object type %x\n", debug_id, hdr->type); break; } } } /* Clean up all the objects in the buffer */ static inline void binder_release_entire_buffer(struct binder_proc *proc, struct binder_thread *thread, struct binder_buffer *buffer, bool is_failure) { binder_size_t off_end_offset; off_end_offset = ALIGN(buffer->data_size, sizeof(void *)); off_end_offset += buffer->offsets_size; binder_transaction_buffer_release(proc, thread, buffer, off_end_offset, is_failure); } static int binder_translate_binder(struct flat_binder_object *fp, struct binder_transaction *t, struct binder_thread *thread) { struct binder_node *node; struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; struct binder_ref_data rdata; int ret = 0; node = binder_get_node(proc, fp->binder); if (!node) { node = binder_new_node(proc, fp); if (!node) return -ENOMEM; } if (fp->cookie != node->cookie) { binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n", proc->pid, thread->pid, (u64)fp->binder, node->debug_id, (u64)fp->cookie, (u64)node->cookie); ret = -EINVAL; goto done; } if (security_binder_transfer_binder(proc->cred, target_proc->cred)) { ret = -EPERM; goto done; } ret = binder_inc_ref_for_node(target_proc, node, fp->hdr.type == BINDER_TYPE_BINDER, &thread->todo, &rdata); if (ret) goto done; if (fp->hdr.type == BINDER_TYPE_BINDER) fp->hdr.type = BINDER_TYPE_HANDLE; else fp->hdr.type = BINDER_TYPE_WEAK_HANDLE; fp->binder = 0; fp->handle = rdata.desc; fp->cookie = 0; trace_binder_transaction_node_to_ref(t, node, &rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " node %d u%016llx -> ref %d desc %d\n", node->debug_id, (u64)node->ptr, rdata.debug_id, rdata.desc); done: binder_put_node(node); return ret; } static int binder_translate_handle(struct flat_binder_object *fp, struct binder_transaction *t, struct binder_thread *thread) { struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; struct binder_node *node; struct binder_ref_data src_rdata; int ret = 0; node = binder_get_node_from_ref(proc, fp->handle, fp->hdr.type == BINDER_TYPE_HANDLE, &src_rdata); if (!node) { binder_user_error("%d:%d got transaction with invalid handle, %d\n", proc->pid, thread->pid, fp->handle); return -EINVAL; } if (security_binder_transfer_binder(proc->cred, target_proc->cred)) { ret = -EPERM; goto done; } binder_node_lock(node); if (node->proc == target_proc) { if (fp->hdr.type == BINDER_TYPE_HANDLE) fp->hdr.type = BINDER_TYPE_BINDER; else fp->hdr.type = BINDER_TYPE_WEAK_BINDER; fp->binder = node->ptr; fp->cookie = node->cookie; if (node->proc) binder_inner_proc_lock(node->proc); else __acquire(&node->proc->inner_lock); binder_inc_node_nilocked(node, fp->hdr.type == BINDER_TYPE_BINDER, 0, NULL); if (node->proc) binder_inner_proc_unlock(node->proc); else __release(&node->proc->inner_lock); trace_binder_transaction_ref_to_node(t, node, &src_rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> node %d u%016llx\n", src_rdata.debug_id, src_rdata.desc, node->debug_id, (u64)node->ptr); binder_node_unlock(node); } else { struct binder_ref_data dest_rdata; binder_node_unlock(node); ret = binder_inc_ref_for_node(target_proc, node, fp->hdr.type == BINDER_TYPE_HANDLE, NULL, &dest_rdata); if (ret) goto done; fp->binder = 0; fp->handle = dest_rdata.desc; fp->cookie = 0; trace_binder_transaction_ref_to_ref(t, node, &src_rdata, &dest_rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> ref %d desc %d (node %d)\n", src_rdata.debug_id, src_rdata.desc, dest_rdata.debug_id, dest_rdata.desc, node->debug_id); } done: binder_put_node(node); return ret; } static int binder_translate_fd(u32 fd, binder_size_t fd_offset, struct binder_transaction *t, struct binder_thread *thread, struct binder_transaction *in_reply_to) { struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; struct binder_txn_fd_fixup *fixup; struct file *file; int ret = 0; bool target_allows_fd; if (in_reply_to) target_allows_fd = !!(in_reply_to->flags & TF_ACCEPT_FDS); else target_allows_fd = t->buffer->target_node->accept_fds; if (!target_allows_fd) { binder_user_error("%d:%d got %s with fd, %d, but target does not allow fds\n", proc->pid, thread->pid, in_reply_to ? "reply" : "transaction", fd); ret = -EPERM; goto err_fd_not_accepted; } file = fget(fd); if (!file) { binder_user_error("%d:%d got transaction with invalid fd, %d\n", proc->pid, thread->pid, fd); ret = -EBADF; goto err_fget; } ret = security_binder_transfer_file(proc->cred, target_proc->cred, file); if (ret < 0) { ret = -EPERM; goto err_security; } /* * Add fixup record for this transaction. The allocation * of the fd in the target needs to be done from a * target thread. */ fixup = kzalloc(sizeof(*fixup), GFP_KERNEL); if (!fixup) { ret = -ENOMEM; goto err_alloc; } fixup->file = file; fixup->offset = fd_offset; fixup->target_fd = -1; trace_binder_transaction_fd_send(t, fd, fixup->offset); list_add_tail(&fixup->fixup_entry, &t->fd_fixups); return ret; err_alloc: err_security: fput(file); err_fget: err_fd_not_accepted: return ret; } /** * struct binder_ptr_fixup - data to be fixed-up in target buffer * @offset offset in target buffer to fixup * @skip_size bytes to skip in copy (fixup will be written later) * @fixup_data data to write at fixup offset * @node list node * * This is used for the pointer fixup list (pf) which is created and consumed * during binder_transaction() and is only accessed locally. No * locking is necessary. * * The list is ordered by @offset. */ struct binder_ptr_fixup { binder_size_t offset; size_t skip_size; binder_uintptr_t fixup_data; struct list_head node; }; /** * struct binder_sg_copy - scatter-gather data to be copied * @offset offset in target buffer * @sender_uaddr user address in source buffer * @length bytes to copy * @node list node * * This is used for the sg copy list (sgc) which is created and consumed * during binder_transaction() and is only accessed locally. No * locking is necessary. * * The list is ordered by @offset. */ struct binder_sg_copy { binder_size_t offset; const void __user *sender_uaddr; size_t length; struct list_head node; }; /** * binder_do_deferred_txn_copies() - copy and fixup scatter-gather data * @alloc: binder_alloc associated with @buffer * @buffer: binder buffer in target process * @sgc_head: list_head of scatter-gather copy list * @pf_head: list_head of pointer fixup list * * Processes all elements of @sgc_head, applying fixups from @pf_head * and copying the scatter-gather data from the source process' user * buffer to the target's buffer. It is expected that the list creation * and processing all occurs during binder_transaction() so these lists * are only accessed in local context. * * Return: 0=success, else -errno */ static int binder_do_deferred_txn_copies(struct binder_alloc *alloc, struct binder_buffer *buffer, struct list_head *sgc_head, struct list_head *pf_head) { int ret = 0; struct binder_sg_copy *sgc, *tmpsgc; struct binder_ptr_fixup *tmppf; struct binder_ptr_fixup *pf = list_first_entry_or_null(pf_head, struct binder_ptr_fixup, node); list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) { size_t bytes_copied = 0; while (bytes_copied < sgc->length) { size_t copy_size; size_t bytes_left = sgc->length - bytes_copied; size_t offset = sgc->offset + bytes_copied; /* * We copy up to the fixup (pointed to by pf) */ copy_size = pf ? min(bytes_left, (size_t)pf->offset - offset) : bytes_left; if (!ret && copy_size) ret = binder_alloc_copy_user_to_buffer( alloc, buffer, offset, sgc->sender_uaddr + bytes_copied, copy_size); bytes_copied += copy_size; if (copy_size != bytes_left) { BUG_ON(!pf); /* we stopped at a fixup offset */ if (pf->skip_size) { /* * we are just skipping. This is for * BINDER_TYPE_FDA where the translated * fds will be fixed up when we get * to target context. */ bytes_copied += pf->skip_size; } else { /* apply the fixup indicated by pf */ if (!ret) ret = binder_alloc_copy_to_buffer( alloc, buffer, pf->offset, &pf->fixup_data, sizeof(pf->fixup_data)); bytes_copied += sizeof(pf->fixup_data); } list_del(&pf->node); kfree(pf); pf = list_first_entry_or_null(pf_head, struct binder_ptr_fixup, node); } } list_del(&sgc->node); kfree(sgc); } list_for_each_entry_safe(pf, tmppf, pf_head, node) { BUG_ON(pf->skip_size == 0); list_del(&pf->node); kfree(pf); } BUG_ON(!list_empty(sgc_head)); return ret > 0 ? -EINVAL : ret; } /** * binder_cleanup_deferred_txn_lists() - free specified lists * @sgc_head: list_head of scatter-gather copy list * @pf_head: list_head of pointer fixup list * * Called to clean up @sgc_head and @pf_head if there is an * error. */ static void binder_cleanup_deferred_txn_lists(struct list_head *sgc_head, struct list_head *pf_head) { struct binder_sg_copy *sgc, *tmpsgc; struct binder_ptr_fixup *pf, *tmppf; list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) { list_del(&sgc->node); kfree(sgc); } list_for_each_entry_safe(pf, tmppf, pf_head, node) { list_del(&pf->node); kfree(pf); } } /** * binder_defer_copy() - queue a scatter-gather buffer for copy * @sgc_head: list_head of scatter-gather copy list * @offset: binder buffer offset in target process * @sender_uaddr: user address in source process * @length: bytes to copy * * Specify a scatter-gather block to be copied. The actual copy must * be deferred until all the needed fixups are identified and queued. * Then the copy and fixups are done together so un-translated values * from the source are never visible in the target buffer. * * We are guaranteed that repeated calls to this function will have * monotonically increasing @offset values so the list will naturally * be ordered. * * Return: 0=success, else -errno */ static int binder_defer_copy(struct list_head *sgc_head, binder_size_t offset, const void __user *sender_uaddr, size_t length) { struct binder_sg_copy *bc = kzalloc(sizeof(*bc), GFP_KERNEL); if (!bc) return -ENOMEM; bc->offset = offset; bc->sender_uaddr = sender_uaddr; bc->length = length; INIT_LIST_HEAD(&bc->node); /* * We are guaranteed that the deferred copies are in-order * so just add to the tail. */ list_add_tail(&bc->node, sgc_head); return 0; } /** * binder_add_fixup() - queue a fixup to be applied to sg copy * @pf_head: list_head of binder ptr fixup list * @offset: binder buffer offset in target process * @fixup: bytes to be copied for fixup * @skip_size: bytes to skip when copying (fixup will be applied later) * * Add the specified fixup to a list ordered by @offset. When copying * the scatter-gather buffers, the fixup will be copied instead of * data from the source buffer. For BINDER_TYPE_FDA fixups, the fixup * will be applied later (in target process context), so we just skip * the bytes specified by @skip_size. If @skip_size is 0, we copy the * value in @fixup. * * This function is called *mostly* in @offset order, but there are * exceptions. Since out-of-order inserts are relatively uncommon, * we insert the new element by searching backward from the tail of * the list. * * Return: 0=success, else -errno */ static int binder_add_fixup(struct list_head *pf_head, binder_size_t offset, binder_uintptr_t fixup, size_t skip_size) { struct binder_ptr_fixup *pf = kzalloc(sizeof(*pf), GFP_KERNEL); struct binder_ptr_fixup *tmppf; if (!pf) return -ENOMEM; pf->offset = offset; pf->fixup_data = fixup; pf->skip_size = skip_size; INIT_LIST_HEAD(&pf->node); /* Fixups are *mostly* added in-order, but there are some * exceptions. Look backwards through list for insertion point. */ list_for_each_entry_reverse(tmppf, pf_head, node) { if (tmppf->offset < pf->offset) { list_add(&pf->node, &tmppf->node); return 0; } } /* * if we get here, then the new offset is the lowest so * insert at the head */ list_add(&pf->node, pf_head); return 0; } static int binder_translate_fd_array(struct list_head *pf_head, struct binder_fd_array_object *fda, const void __user *sender_ubuffer, struct binder_buffer_object *parent, struct binder_buffer_object *sender_uparent, struct binder_transaction *t, struct binder_thread *thread, struct binder_transaction *in_reply_to) { binder_size_t fdi, fd_buf_size; binder_size_t fda_offset; const void __user *sender_ufda_base; struct binder_proc *proc = thread->proc; int ret; if (fda->num_fds == 0) return 0; fd_buf_size = sizeof(u32) * fda->num_fds; if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { binder_user_error("%d:%d got transaction with invalid number of fds (%lld)\n", proc->pid, thread->pid, (u64)fda->num_fds); return -EINVAL; } if (fd_buf_size > parent->length || fda->parent_offset > parent->length - fd_buf_size) { /* No space for all file descriptors here. */ binder_user_error("%d:%d not enough space to store %lld fds in buffer\n", proc->pid, thread->pid, (u64)fda->num_fds); return -EINVAL; } /* * the source data for binder_buffer_object is visible * to user-space and the @buffer element is the user * pointer to the buffer_object containing the fd_array. * Convert the address to an offset relative to * the base of the transaction buffer. */ fda_offset = parent->buffer - t->buffer->user_data + fda->parent_offset; sender_ufda_base = (void __user *)(uintptr_t)sender_uparent->buffer + fda->parent_offset; if (!IS_ALIGNED((unsigned long)fda_offset, sizeof(u32)) || !IS_ALIGNED((unsigned long)sender_ufda_base, sizeof(u32))) { binder_user_error("%d:%d parent offset not aligned correctly.\n", proc->pid, thread->pid); return -EINVAL; } ret = binder_add_fixup(pf_head, fda_offset, 0, fda->num_fds * sizeof(u32)); if (ret) return ret; for (fdi = 0; fdi < fda->num_fds; fdi++) { u32 fd; binder_size_t offset = fda_offset + fdi * sizeof(fd); binder_size_t sender_uoffset = fdi * sizeof(fd); ret = copy_from_user(&fd, sender_ufda_base + sender_uoffset, sizeof(fd)); if (!ret) ret = binder_translate_fd(fd, offset, t, thread, in_reply_to); if (ret) return ret > 0 ? -EINVAL : ret; } return 0; } static int binder_fixup_parent(struct list_head *pf_head, struct binder_transaction *t, struct binder_thread *thread, struct binder_buffer_object *bp, binder_size_t off_start_offset, binder_size_t num_valid, binder_size_t last_fixup_obj_off, binder_size_t last_fixup_min_off) { struct binder_buffer_object *parent; struct binder_buffer *b = t->buffer; struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; struct binder_object object; binder_size_t buffer_offset; binder_size_t parent_offset; if (!(bp->flags & BINDER_BUFFER_FLAG_HAS_PARENT)) return 0; parent = binder_validate_ptr(target_proc, b, &object, bp->parent, off_start_offset, &parent_offset, num_valid); if (!parent) { binder_user_error("%d:%d got transaction with invalid parent offset or type\n", proc->pid, thread->pid); return -EINVAL; } if (!binder_validate_fixup(target_proc, b, off_start_offset, parent_offset, bp->parent_offset, last_fixup_obj_off, last_fixup_min_off)) { binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", proc->pid, thread->pid); return -EINVAL; } if (parent->length < sizeof(binder_uintptr_t) || bp->parent_offset > parent->length - sizeof(binder_uintptr_t)) { /* No space for a pointer here! */ binder_user_error("%d:%d got transaction with invalid parent offset\n", proc->pid, thread->pid); return -EINVAL; } buffer_offset = bp->parent_offset + parent->buffer - b->user_data; return binder_add_fixup(pf_head, buffer_offset, bp->buffer, 0); } /** * binder_can_update_transaction() - Can a txn be superseded by an updated one? * @t1: the pending async txn in the frozen process * @t2: the new async txn to supersede the outdated pending one * * Return: true if t2 can supersede t1 * false if t2 can not supersede t1 */ static bool binder_can_update_transaction(struct binder_transaction *t1, struct binder_transaction *t2) { if ((t1->flags & t2->flags & (TF_ONE_WAY | TF_UPDATE_TXN)) != (TF_ONE_WAY | TF_UPDATE_TXN) || !t1->to_proc || !t2->to_proc) return false; if (t1->to_proc->tsk == t2->to_proc->tsk && t1->code == t2->code && t1->flags == t2->flags && t1->buffer->pid == t2->buffer->pid && t1->buffer->target_node->ptr == t2->buffer->target_node->ptr && t1->buffer->target_node->cookie == t2->buffer->target_node->cookie) return true; return false; } /** * binder_find_outdated_transaction_ilocked() - Find the outdated transaction * @t: new async transaction * @target_list: list to find outdated transaction * * Return: the outdated transaction if found * NULL if no outdated transacton can be found * * Requires the proc->inner_lock to be held. */ static struct binder_transaction * binder_find_outdated_transaction_ilocked(struct binder_transaction *t, struct list_head *target_list) { struct binder_work *w; list_for_each_entry(w, target_list, entry) { struct binder_transaction *t_queued; if (w->type != BINDER_WORK_TRANSACTION) continue; t_queued = container_of(w, struct binder_transaction, work); if (binder_can_update_transaction(t_queued, t)) return t_queued; } return NULL; } /** * binder_proc_transaction() - sends a transaction to a process and wakes it up * @t: transaction to send * @proc: process to send the transaction to * @thread: thread in @proc to send the transaction to (may be NULL) * * This function queues a transaction to the specified process. It will try * to find a thread in the target process to handle the transaction and * wake it up. If no thread is found, the work is queued to the proc * waitqueue. * * If the @thread parameter is not NULL, the transaction is always queued * to the waitlist of that specific thread. * * Return: 0 if the transaction was successfully queued * BR_DEAD_REPLY if the target process or thread is dead * BR_FROZEN_REPLY if the target process or thread is frozen and * the sync transaction was rejected * BR_TRANSACTION_PENDING_FROZEN if the target process is frozen * and the async transaction was successfully queued */ static int binder_proc_transaction(struct binder_transaction *t, struct binder_proc *proc, struct binder_thread *thread) { struct binder_node *node = t->buffer->target_node; bool oneway = !!(t->flags & TF_ONE_WAY); bool pending_async = false; struct binder_transaction *t_outdated = NULL; bool frozen = false; BUG_ON(!node); binder_node_lock(node); if (oneway) { BUG_ON(thread); if (node->has_async_transaction) pending_async = true; else node->has_async_transaction = true; } binder_inner_proc_lock(proc); if (proc->is_frozen) { frozen = true; proc->sync_recv |= !oneway; proc->async_recv |= oneway; } if ((frozen && !oneway) || proc->is_dead || (thread && thread->is_dead)) { binder_inner_proc_unlock(proc); binder_node_unlock(node); return frozen ? BR_FROZEN_REPLY : BR_DEAD_REPLY; } if (!thread && !pending_async) thread = binder_select_thread_ilocked(proc); if (thread) { binder_enqueue_thread_work_ilocked(thread, &t->work); } else if (!pending_async) { binder_enqueue_work_ilocked(&t->work, &proc->todo); } else { if ((t->flags & TF_UPDATE_TXN) && frozen) { t_outdated = binder_find_outdated_transaction_ilocked(t, &node->async_todo); if (t_outdated) { binder_debug(BINDER_DEBUG_TRANSACTION, "txn %d supersedes %d\n", t->debug_id, t_outdated->debug_id); list_del_init(&t_outdated->work.entry); proc->outstanding_txns--; } } binder_enqueue_work_ilocked(&t->work, &node->async_todo); } if (!pending_async) binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */); proc->outstanding_txns++; binder_inner_proc_unlock(proc); binder_node_unlock(node); /* * To reduce potential contention, free the outdated transaction and * buffer after releasing the locks. */ if (t_outdated) { struct binder_buffer *buffer = t_outdated->buffer; t_outdated->buffer = NULL; buffer->transaction = NULL; trace_binder_transaction_update_buffer_release(buffer); binder_release_entire_buffer(proc, NULL, buffer, false); binder_alloc_free_buf(&proc->alloc, buffer); kfree(t_outdated); binder_stats_deleted(BINDER_STAT_TRANSACTION); } if (oneway && frozen) return BR_TRANSACTION_PENDING_FROZEN; return 0; } /** * binder_get_node_refs_for_txn() - Get required refs on node for txn * @node: struct binder_node for which to get refs * @procp: returns @node->proc if valid * @error: if no @procp then returns BR_DEAD_REPLY * * User-space normally keeps the node alive when creating a transaction * since it has a reference to the target. The local strong ref keeps it * alive if the sending process dies before the target process processes * the transaction. If the source process is malicious or has a reference * counting bug, relying on the local strong ref can fail. * * Since user-space can cause the local strong ref to go away, we also take * a tmpref on the node to ensure it survives while we are constructing * the transaction. We also need a tmpref on the proc while we are * constructing the transaction, so we take that here as well. * * Return: The target_node with refs taken or NULL if no @node->proc is NULL. * Also sets @procp if valid. If the @node->proc is NULL indicating that the * target proc has died, @error is set to BR_DEAD_REPLY. */ static struct binder_node *binder_get_node_refs_for_txn( struct binder_node *node, struct binder_proc **procp, uint32_t *error) { struct binder_node *target_node = NULL; binder_node_inner_lock(node); if (node->proc) { target_node = node; binder_inc_node_nilocked(node, 1, 0, NULL); binder_inc_node_tmpref_ilocked(node); node->proc->tmp_ref++; *procp = node->proc; } else *error = BR_DEAD_REPLY; binder_node_inner_unlock(node); return target_node; } static void binder_set_txn_from_error(struct binder_transaction *t, int id, uint32_t command, int32_t param) { struct binder_thread *from = binder_get_txn_from_and_acq_inner(t); if (!from) { /* annotation for sparse */ __release(&from->proc->inner_lock); return; } /* don't override existing errors */ if (from->ee.command == BR_OK) binder_set_extended_error(&from->ee, id, command, param); binder_inner_proc_unlock(from->proc); binder_thread_dec_tmpref(from); } static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, binder_size_t extra_buffers_size) { int ret; struct binder_transaction *t; struct binder_work *w; struct binder_work *tcomplete; binder_size_t buffer_offset = 0; binder_size_t off_start_offset, off_end_offset; binder_size_t off_min; binder_size_t sg_buf_offset, sg_buf_end_offset; binder_size_t user_offset = 0; struct binder_proc *target_proc = NULL; struct binder_thread *target_thread = NULL; struct binder_node *target_node = NULL; struct binder_transaction *in_reply_to = NULL; struct binder_transaction_log_entry *e; uint32_t return_error = 0; uint32_t return_error_param = 0; uint32_t return_error_line = 0; binder_size_t last_fixup_obj_off = 0; binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; int t_debug_id = atomic_inc_return(&binder_last_id); ktime_t t_start_time = ktime_get(); char *secctx = NULL; u32 secctx_sz = 0; struct list_head sgc_head; struct list_head pf_head; const void __user *user_buffer = (const void __user *) (uintptr_t)tr->data.ptr.buffer; INIT_LIST_HEAD(&sgc_head); INIT_LIST_HEAD(&pf_head); e = binder_transaction_log_add(&binder_transaction_log); e->debug_id = t_debug_id; e->call_type = reply ? 2 : !!(tr->flags & TF_ONE_WAY); e->from_proc = proc->pid; e->from_thread = thread->pid; e->target_handle = tr->target.handle; e->data_size = tr->data_size; e->offsets_size = tr->offsets_size; strscpy(e->context_name, proc->context->name, BINDERFS_MAX_NAME); binder_inner_proc_lock(proc); binder_set_extended_error(&thread->ee, t_debug_id, BR_OK, 0); binder_inner_proc_unlock(proc); if (reply) { binder_inner_proc_lock(proc); in_reply_to = thread->transaction_stack; if (in_reply_to == NULL) { binder_inner_proc_unlock(proc); binder_user_error("%d:%d got reply transaction with no transaction stack\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; goto err_empty_call_stack; } if (in_reply_to->to_thread != thread) { spin_lock(&in_reply_to->lock); binder_user_error("%d:%d got reply transaction with bad transaction stack, transaction %d has target %d:%d\n", proc->pid, thread->pid, in_reply_to->debug_id, in_reply_to->to_proc ? in_reply_to->to_proc->pid : 0, in_reply_to->to_thread ? in_reply_to->to_thread->pid : 0); spin_unlock(&in_reply_to->lock); binder_inner_proc_unlock(proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; in_reply_to = NULL; goto err_bad_call_stack; } thread->transaction_stack = in_reply_to->to_parent; binder_inner_proc_unlock(proc); binder_set_nice(in_reply_to->saved_priority); target_thread = binder_get_txn_from_and_acq_inner(in_reply_to); if (target_thread == NULL) { /* annotation for sparse */ __release(&target_thread->proc->inner_lock); binder_txn_error("%d:%d reply target not found\n", thread->pid, proc->pid); return_error = BR_DEAD_REPLY; return_error_line = __LINE__; goto err_dead_binder; } if (target_thread->transaction_stack != in_reply_to) { binder_user_error("%d:%d got reply transaction with bad target transaction stack %d, expected %d\n", proc->pid, thread->pid, target_thread->transaction_stack ? target_thread->transaction_stack->debug_id : 0, in_reply_to->debug_id); binder_inner_proc_unlock(target_thread->proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; in_reply_to = NULL; target_thread = NULL; goto err_dead_binder; } target_proc = target_thread->proc; target_proc->tmp_ref++; binder_inner_proc_unlock(target_thread->proc); } else { if (tr->target.handle) { struct binder_ref *ref; /* * There must already be a strong ref * on this node. If so, do a strong * increment on the node to ensure it * stays alive until the transaction is * done. */ binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, tr->target.handle, true); if (ref) { target_node = binder_get_node_refs_for_txn( ref->node, &target_proc, &return_error); } else { binder_user_error("%d:%d got transaction to invalid handle, %u\n", proc->pid, thread->pid, tr->target.handle); return_error = BR_FAILED_REPLY; } binder_proc_unlock(proc); } else { mutex_lock(&context->context_mgr_node_lock); target_node = context->binder_context_mgr_node; if (target_node) target_node = binder_get_node_refs_for_txn( target_node, &target_proc, &return_error); else return_error = BR_DEAD_REPLY; mutex_unlock(&context->context_mgr_node_lock); if (target_node && target_proc->pid == proc->pid) { binder_user_error("%d:%d got transaction to context manager from process owning it\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_invalid_target_handle; } } if (!target_node) { binder_txn_error("%d:%d cannot find target node\n", thread->pid, proc->pid); /* * return_error is set above */ return_error_param = -EINVAL; return_error_line = __LINE__; goto err_dead_binder; } e->to_node = target_node->debug_id; if (WARN_ON(proc == target_proc)) { binder_txn_error("%d:%d self transactions not allowed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_invalid_target_handle; } if (security_binder_transaction(proc->cred, target_proc->cred) < 0) { binder_txn_error("%d:%d transaction credentials failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -EPERM; return_error_line = __LINE__; goto err_invalid_target_handle; } binder_inner_proc_lock(proc); w = list_first_entry_or_null(&thread->todo, struct binder_work, entry); if (!(tr->flags & TF_ONE_WAY) && w && w->type == BINDER_WORK_TRANSACTION) { /* * Do not allow new outgoing transaction from a * thread that has a transaction at the head of * its todo list. Only need to check the head * because binder_select_thread_ilocked picks a * thread from proc->waiting_threads to enqueue * the transaction, and nothing is queued to the * todo list while the thread is on waiting_threads. */ binder_user_error("%d:%d new transaction not allowed when there is a transaction on thread todo\n", proc->pid, thread->pid); binder_inner_proc_unlock(proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; goto err_bad_todo_list; } if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) { struct binder_transaction *tmp; tmp = thread->transaction_stack; if (tmp->to_thread != thread) { spin_lock(&tmp->lock); binder_user_error("%d:%d got new transaction with bad transaction stack, transaction %d has target %d:%d\n", proc->pid, thread->pid, tmp->debug_id, tmp->to_proc ? tmp->to_proc->pid : 0, tmp->to_thread ? tmp->to_thread->pid : 0); spin_unlock(&tmp->lock); binder_inner_proc_unlock(proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; goto err_bad_call_stack; } while (tmp) { struct binder_thread *from; spin_lock(&tmp->lock); from = tmp->from; if (from && from->proc == target_proc) { atomic_inc(&from->tmp_ref); target_thread = from; spin_unlock(&tmp->lock); break; } spin_unlock(&tmp->lock); tmp = tmp->from_parent; } } binder_inner_proc_unlock(proc); } if (target_thread) e->to_thread = target_thread->pid; e->to_proc = target_proc->pid; /* TODO: reuse incoming transaction for reply */ t = kzalloc(sizeof(*t), GFP_KERNEL); if (t == NULL) { binder_txn_error("%d:%d cannot allocate transaction\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -ENOMEM; return_error_line = __LINE__; goto err_alloc_t_failed; } INIT_LIST_HEAD(&t->fd_fixups); binder_stats_created(BINDER_STAT_TRANSACTION); spin_lock_init(&t->lock); tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL); if (tcomplete == NULL) { binder_txn_error("%d:%d cannot allocate work for transaction\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -ENOMEM; return_error_line = __LINE__; goto err_alloc_tcomplete_failed; } binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE); t->debug_id = t_debug_id; t->start_time = t_start_time; if (reply) binder_debug(BINDER_DEBUG_TRANSACTION, "%d:%d BC_REPLY %d -> %d:%d, data %016llx-%016llx size %lld-%lld-%lld\n", proc->pid, thread->pid, t->debug_id, target_proc->pid, target_thread->pid, (u64)tr->data.ptr.buffer, (u64)tr->data.ptr.offsets, (u64)tr->data_size, (u64)tr->offsets_size, (u64)extra_buffers_size); else binder_debug(BINDER_DEBUG_TRANSACTION, "%d:%d BC_TRANSACTION %d -> %d - node %d, data %016llx-%016llx size %lld-%lld-%lld\n", proc->pid, thread->pid, t->debug_id, target_proc->pid, target_node->debug_id, (u64)tr->data.ptr.buffer, (u64)tr->data.ptr.offsets, (u64)tr->data_size, (u64)tr->offsets_size, (u64)extra_buffers_size); if (!reply && !(tr->flags & TF_ONE_WAY)) t->from = thread; else t->from = NULL; t->from_pid = proc->pid; t->from_tid = thread->pid; t->sender_euid = task_euid(proc->tsk); t->to_proc = target_proc; t->to_thread = target_thread; t->code = tr->code; t->flags = tr->flags; t->priority = task_nice(current); if (target_node && target_node->txn_security_ctx) { u32 secid; size_t added_size; security_cred_getsecid(proc->cred, &secid); ret = security_secid_to_secctx(secid, &secctx, &secctx_sz); if (ret) { binder_txn_error("%d:%d failed to get security context\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_get_secctx_failed; } added_size = ALIGN(secctx_sz, sizeof(u64)); extra_buffers_size += added_size; if (extra_buffers_size < added_size) { binder_txn_error("%d:%d integer overflow of extra_buffers_size\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_extra_size; } } trace_binder_transaction(reply, t, target_node); t->buffer = binder_alloc_new_buf(&target_proc->alloc, tr->data_size, tr->offsets_size, extra_buffers_size, !reply && (t->flags & TF_ONE_WAY)); if (IS_ERR(t->buffer)) { char *s; ret = PTR_ERR(t->buffer); s = (ret == -ESRCH) ? ": vma cleared, target dead or dying" : (ret == -ENOSPC) ? ": no space left" : (ret == -ENOMEM) ? ": memory allocation failed" : ""; binder_txn_error("cannot allocate buffer%s", s); return_error_param = PTR_ERR(t->buffer); return_error = return_error_param == -ESRCH ? BR_DEAD_REPLY : BR_FAILED_REPLY; return_error_line = __LINE__; t->buffer = NULL; goto err_binder_alloc_buf_failed; } if (secctx) { int err; size_t buf_offset = ALIGN(tr->data_size, sizeof(void *)) + ALIGN(tr->offsets_size, sizeof(void *)) + ALIGN(extra_buffers_size, sizeof(void *)) - ALIGN(secctx_sz, sizeof(u64)); t->security_ctx = t->buffer->user_data + buf_offset; err = binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, buf_offset, secctx, secctx_sz); if (err) { t->security_ctx = 0; WARN_ON(1); } security_release_secctx(secctx, secctx_sz); secctx = NULL; } t->buffer->debug_id = t->debug_id; t->buffer->transaction = t; t->buffer->target_node = target_node; t->buffer->clear_on_free = !!(t->flags & TF_CLEAR_BUF); trace_binder_transaction_alloc_buf(t->buffer); if (binder_alloc_copy_user_to_buffer( &target_proc->alloc, t->buffer, ALIGN(tr->data_size, sizeof(void *)), (const void __user *) (uintptr_t)tr->data.ptr.offsets, tr->offsets_size)) { binder_user_error("%d:%d got transaction with invalid offsets ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EFAULT; return_error_line = __LINE__; goto err_copy_data_failed; } if (!IS_ALIGNED(tr->offsets_size, sizeof(binder_size_t))) { binder_user_error("%d:%d got transaction with invalid offsets size, %lld\n", proc->pid, thread->pid, (u64)tr->offsets_size); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } if (!IS_ALIGNED(extra_buffers_size, sizeof(u64))) { binder_user_error("%d:%d got transaction with unaligned buffers size, %lld\n", proc->pid, thread->pid, (u64)extra_buffers_size); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } off_start_offset = ALIGN(tr->data_size, sizeof(void *)); buffer_offset = off_start_offset; off_end_offset = off_start_offset + tr->offsets_size; sg_buf_offset = ALIGN(off_end_offset, sizeof(void *)); sg_buf_end_offset = sg_buf_offset + extra_buffers_size - ALIGN(secctx_sz, sizeof(u64)); off_min = 0; for (buffer_offset = off_start_offset; buffer_offset < off_end_offset; buffer_offset += sizeof(binder_size_t)) { struct binder_object_header *hdr; size_t object_size; struct binder_object object; binder_size_t object_offset; binder_size_t copy_size; if (binder_alloc_copy_from_buffer(&target_proc->alloc, &object_offset, t->buffer, buffer_offset, sizeof(object_offset))) { binder_txn_error("%d:%d copy offset from buffer failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } /* * Copy the source user buffer up to the next object * that will be processed. */ copy_size = object_offset - user_offset; if (copy_size && (user_offset > object_offset || binder_alloc_copy_user_to_buffer( &target_proc->alloc, t->buffer, user_offset, user_buffer + user_offset, copy_size))) { binder_user_error("%d:%d got transaction with invalid data ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EFAULT; return_error_line = __LINE__; goto err_copy_data_failed; } object_size = binder_get_object(target_proc, user_buffer, t->buffer, object_offset, &object); if (object_size == 0 || object_offset < off_min) { binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n", proc->pid, thread->pid, (u64)object_offset, (u64)off_min, (u64)t->buffer->data_size); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } /* * Set offset to the next buffer fragment to be * copied */ user_offset = object_offset + object_size; hdr = &object.hdr; off_min = object_offset + object_size; switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { struct flat_binder_object *fp; fp = to_flat_binder_object(hdr); ret = binder_translate_binder(fp, t, thread); if (ret < 0 || binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, fp, sizeof(*fp))) { binder_txn_error("%d:%d translate binder failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { struct flat_binder_object *fp; fp = to_flat_binder_object(hdr); ret = binder_translate_handle(fp, t, thread); if (ret < 0 || binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, fp, sizeof(*fp))) { binder_txn_error("%d:%d translate handle failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } } break; case BINDER_TYPE_FD: { struct binder_fd_object *fp = to_binder_fd_object(hdr); binder_size_t fd_offset = object_offset + (uintptr_t)&fp->fd - (uintptr_t)fp; int ret = binder_translate_fd(fp->fd, fd_offset, t, thread, in_reply_to); fp->pad_binder = 0; if (ret < 0 || binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, fp, sizeof(*fp))) { binder_txn_error("%d:%d translate fd failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } } break; case BINDER_TYPE_FDA: { struct binder_object ptr_object; binder_size_t parent_offset; struct binder_object user_object; size_t user_parent_size; struct binder_fd_array_object *fda = to_binder_fd_array_object(hdr); size_t num_valid = (buffer_offset - off_start_offset) / sizeof(binder_size_t); struct binder_buffer_object *parent = binder_validate_ptr(target_proc, t->buffer, &ptr_object, fda->parent, off_start_offset, &parent_offset, num_valid); if (!parent) { binder_user_error("%d:%d got transaction with invalid parent offset or type\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_parent; } if (!binder_validate_fixup(target_proc, t->buffer, off_start_offset, parent_offset, fda->parent_offset, last_fixup_obj_off, last_fixup_min_off)) { binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_parent; } /* * We need to read the user version of the parent * object to get the original user offset */ user_parent_size = binder_get_object(proc, user_buffer, t->buffer, parent_offset, &user_object); if (user_parent_size != sizeof(user_object.bbo)) { binder_user_error("%d:%d invalid ptr object size: %zd vs %zd\n", proc->pid, thread->pid, user_parent_size, sizeof(user_object.bbo)); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_parent; } ret = binder_translate_fd_array(&pf_head, fda, user_buffer, parent, &user_object.bbo, t, thread, in_reply_to); if (!ret) ret = binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, fda, sizeof(*fda)); if (ret) { binder_txn_error("%d:%d translate fd array failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret > 0 ? -EINVAL : ret; return_error_line = __LINE__; goto err_translate_failed; } last_fixup_obj_off = parent_offset; last_fixup_min_off = fda->parent_offset + sizeof(u32) * fda->num_fds; } break; case BINDER_TYPE_PTR: { struct binder_buffer_object *bp = to_binder_buffer_object(hdr); size_t buf_left = sg_buf_end_offset - sg_buf_offset; size_t num_valid; if (bp->length > buf_left) { binder_user_error("%d:%d got transaction with too large buffer\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } ret = binder_defer_copy(&sgc_head, sg_buf_offset, (const void __user *)(uintptr_t)bp->buffer, bp->length); if (ret) { binder_txn_error("%d:%d deferred copy failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } /* Fixup buffer pointer to target proc address space */ bp->buffer = t->buffer->user_data + sg_buf_offset; sg_buf_offset += ALIGN(bp->length, sizeof(u64)); num_valid = (buffer_offset - off_start_offset) / sizeof(binder_size_t); ret = binder_fixup_parent(&pf_head, t, thread, bp, off_start_offset, num_valid, last_fixup_obj_off, last_fixup_min_off); if (ret < 0 || binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, bp, sizeof(*bp))) { binder_txn_error("%d:%d failed to fixup parent\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } last_fixup_obj_off = object_offset; last_fixup_min_off = 0; } break; default: binder_user_error("%d:%d got transaction with invalid object type, %x\n", proc->pid, thread->pid, hdr->type); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_object_type; } } /* Done processing objects, copy the rest of the buffer */ if (binder_alloc_copy_user_to_buffer( &target_proc->alloc, t->buffer, user_offset, user_buffer + user_offset, tr->data_size - user_offset)) { binder_user_error("%d:%d got transaction with invalid data ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EFAULT; return_error_line = __LINE__; goto err_copy_data_failed; } ret = binder_do_deferred_txn_copies(&target_proc->alloc, t->buffer, &sgc_head, &pf_head); if (ret) { binder_user_error("%d:%d got transaction with invalid offsets ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_copy_data_failed; } if (t->buffer->oneway_spam_suspect) tcomplete->type = BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT; else tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; t->work.type = BINDER_WORK_TRANSACTION; if (reply) { binder_enqueue_thread_work(thread, tcomplete); binder_inner_proc_lock(target_proc); if (target_thread->is_dead) { return_error = BR_DEAD_REPLY; binder_inner_proc_unlock(target_proc); goto err_dead_proc_or_thread; } BUG_ON(t->buffer->async_transaction != 0); binder_pop_transaction_ilocked(target_thread, in_reply_to); binder_enqueue_thread_work_ilocked(target_thread, &t->work); target_proc->outstanding_txns++; binder_inner_proc_unlock(target_proc); wake_up_interruptible_sync(&target_thread->wait); binder_free_transaction(in_reply_to); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); binder_inner_proc_lock(proc); /* * Defer the TRANSACTION_COMPLETE, so we don't return to * userspace immediately; this allows the target process to * immediately start processing this transaction, reducing * latency. We will then return the TRANSACTION_COMPLETE when * the target replies (or there is an error). */ binder_enqueue_deferred_thread_work_ilocked(thread, tcomplete); t->need_reply = 1; t->from_parent = thread->transaction_stack; thread->transaction_stack = t; binder_inner_proc_unlock(proc); return_error = binder_proc_transaction(t, target_proc, target_thread); if (return_error) { binder_inner_proc_lock(proc); binder_pop_transaction_ilocked(thread, t); binder_inner_proc_unlock(proc); goto err_dead_proc_or_thread; } } else { BUG_ON(target_node == NULL); BUG_ON(t->buffer->async_transaction != 1); return_error = binder_proc_transaction(t, target_proc, NULL); /* * Let the caller know when async transaction reaches a frozen * process and is put in a pending queue, waiting for the target * process to be unfrozen. */ if (return_error == BR_TRANSACTION_PENDING_FROZEN) tcomplete->type = BINDER_WORK_TRANSACTION_PENDING; binder_enqueue_thread_work(thread, tcomplete); if (return_error && return_error != BR_TRANSACTION_PENDING_FROZEN) goto err_dead_proc_or_thread; } if (target_thread) binder_thread_dec_tmpref(target_thread); binder_proc_dec_tmpref(target_proc); if (target_node) binder_dec_node_tmpref(target_node); /* * write barrier to synchronize with initialization * of log entry */ smp_wmb(); WRITE_ONCE(e->debug_id_done, t_debug_id); return; err_dead_proc_or_thread: binder_txn_error("%d:%d dead process or thread\n", thread->pid, proc->pid); return_error_line = __LINE__; binder_dequeue_work(proc, tcomplete); err_translate_failed: err_bad_object_type: err_bad_offset: err_bad_parent: err_copy_data_failed: binder_cleanup_deferred_txn_lists(&sgc_head, &pf_head); binder_free_txn_fixups(t); trace_binder_transaction_failed_buffer_release(t->buffer); binder_transaction_buffer_release(target_proc, NULL, t->buffer, buffer_offset, true); if (target_node) binder_dec_node_tmpref(target_node); target_node = NULL; t->buffer->transaction = NULL; binder_alloc_free_buf(&target_proc->alloc, t->buffer); err_binder_alloc_buf_failed: err_bad_extra_size: if (secctx) security_release_secctx(secctx, secctx_sz); err_get_secctx_failed: kfree(tcomplete); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); err_alloc_tcomplete_failed: if (trace_binder_txn_latency_free_enabled()) binder_txn_latency_free(t); kfree(t); binder_stats_deleted(BINDER_STAT_TRANSACTION); err_alloc_t_failed: err_bad_todo_list: err_bad_call_stack: err_empty_call_stack: err_dead_binder: err_invalid_target_handle: if (target_node) { binder_dec_node(target_node, 1, 0); binder_dec_node_tmpref(target_node); } binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d transaction %s to %d:%d failed %d/%d/%d, size %lld-%lld line %d\n", proc->pid, thread->pid, reply ? "reply" : (tr->flags & TF_ONE_WAY ? "async" : "call"), target_proc ? target_proc->pid : 0, target_thread ? target_thread->pid : 0, t_debug_id, return_error, return_error_param, (u64)tr->data_size, (u64)tr->offsets_size, return_error_line); if (target_thread) binder_thread_dec_tmpref(target_thread); if (target_proc) binder_proc_dec_tmpref(target_proc); { struct binder_transaction_log_entry *fe; e->return_error = return_error; e->return_error_param = return_error_param; e->return_error_line = return_error_line; fe = binder_transaction_log_add(&binder_transaction_log_failed); *fe = *e; /* * write barrier to synchronize with initialization * of log entry */ smp_wmb(); WRITE_ONCE(e->debug_id_done, t_debug_id); WRITE_ONCE(fe->debug_id_done, t_debug_id); } BUG_ON(thread->return_error.cmd != BR_OK); if (in_reply_to) { binder_set_txn_from_error(in_reply_to, t_debug_id, return_error, return_error_param); thread->return_error.cmd = BR_TRANSACTION_COMPLETE; binder_enqueue_thread_work(thread, &thread->return_error.work); binder_send_failed_reply(in_reply_to, return_error); } else { binder_inner_proc_lock(proc); binder_set_extended_error(&thread->ee, t_debug_id, return_error, return_error_param); binder_inner_proc_unlock(proc); thread->return_error.cmd = return_error; binder_enqueue_thread_work(thread, &thread->return_error.work); } } /** * binder_free_buf() - free the specified buffer * @proc: binder proc that owns buffer * @buffer: buffer to be freed * @is_failure: failed to send transaction * * If buffer for an async transaction, enqueue the next async * transaction from the node. * * Cleanup buffer and free it. */ static void binder_free_buf(struct binder_proc *proc, struct binder_thread *thread, struct binder_buffer *buffer, bool is_failure) { binder_inner_proc_lock(proc); if (buffer->transaction) { buffer->transaction->buffer = NULL; buffer->transaction = NULL; } binder_inner_proc_unlock(proc); if (buffer->async_transaction && buffer->target_node) { struct binder_node *buf_node; struct binder_work *w; buf_node = buffer->target_node; binder_node_inner_lock(buf_node); BUG_ON(!buf_node->has_async_transaction); BUG_ON(buf_node->proc != proc); w = binder_dequeue_work_head_ilocked( &buf_node->async_todo); if (!w) { buf_node->has_async_transaction = false; } else { binder_enqueue_work_ilocked( w, &proc->todo); binder_wakeup_proc_ilocked(proc); } binder_node_inner_unlock(buf_node); } trace_binder_transaction_buffer_release(buffer); binder_release_entire_buffer(proc, thread, buffer, is_failure); binder_alloc_free_buf(&proc->alloc, buffer); } static int binder_thread_write(struct binder_proc *proc, struct binder_thread *thread, binder_uintptr_t binder_buffer, size_t size, binder_size_t *consumed) { uint32_t cmd; struct binder_context *context = proc->context; void __user *buffer = (void __user *)(uintptr_t)binder_buffer; void __user *ptr = buffer + *consumed; void __user *end = buffer + size; while (ptr < end && thread->return_error.cmd == BR_OK) { int ret; if (get_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); trace_binder_command(cmd); if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.bc)) { atomic_inc(&binder_stats.bc[_IOC_NR(cmd)]); atomic_inc(&proc->stats.bc[_IOC_NR(cmd)]); atomic_inc(&thread->stats.bc[_IOC_NR(cmd)]); } switch (cmd) { case BC_INCREFS: case BC_ACQUIRE: case BC_RELEASE: case BC_DECREFS: { uint32_t target; const char *debug_string; bool strong = cmd == BC_ACQUIRE || cmd == BC_RELEASE; bool increment = cmd == BC_INCREFS || cmd == BC_ACQUIRE; struct binder_ref_data rdata; if (get_user(target, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); ret = -1; if (increment && !target) { struct binder_node *ctx_mgr_node; mutex_lock(&context->context_mgr_node_lock); ctx_mgr_node = context->binder_context_mgr_node; if (ctx_mgr_node) { if (ctx_mgr_node->proc == proc) { binder_user_error("%d:%d context manager tried to acquire desc 0\n", proc->pid, thread->pid); mutex_unlock(&context->context_mgr_node_lock); return -EINVAL; } ret = binder_inc_ref_for_node( proc, ctx_mgr_node, strong, NULL, &rdata); } mutex_unlock(&context->context_mgr_node_lock); } if (ret) ret = binder_update_ref_for_handle( proc, target, increment, strong, &rdata); if (!ret && rdata.desc != target) { binder_user_error("%d:%d tried to acquire reference to desc %d, got %d instead\n", proc->pid, thread->pid, target, rdata.desc); } switch (cmd) { case BC_INCREFS: debug_string = "IncRefs"; break; case BC_ACQUIRE: debug_string = "Acquire"; break; case BC_RELEASE: debug_string = "Release"; break; case BC_DECREFS: default: debug_string = "DecRefs"; break; } if (ret) { binder_user_error("%d:%d %s %d refcount change on invalid ref %d ret %d\n", proc->pid, thread->pid, debug_string, strong, target, ret); break; } binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s ref %d desc %d s %d w %d\n", proc->pid, thread->pid, debug_string, rdata.debug_id, rdata.desc, rdata.strong, rdata.weak); break; } case BC_INCREFS_DONE: case BC_ACQUIRE_DONE: { binder_uintptr_t node_ptr; binder_uintptr_t cookie; struct binder_node *node; bool free_node; if (get_user(node_ptr, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); if (get_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); node = binder_get_node(proc, node_ptr); if (node == NULL) { binder_user_error("%d:%d %s u%016llx no match\n", proc->pid, thread->pid, cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", (u64)node_ptr); break; } if (cookie != node->cookie) { binder_user_error("%d:%d %s u%016llx node %d cookie mismatch %016llx != %016llx\n", proc->pid, thread->pid, cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", (u64)node_ptr, node->debug_id, (u64)cookie, (u64)node->cookie); binder_put_node(node); break; } binder_node_inner_lock(node); if (cmd == BC_ACQUIRE_DONE) { if (node->pending_strong_ref == 0) { binder_user_error("%d:%d BC_ACQUIRE_DONE node %d has no pending acquire request\n", proc->pid, thread->pid, node->debug_id); binder_node_inner_unlock(node); binder_put_node(node); break; } node->pending_strong_ref = 0; } else { if (node->pending_weak_ref == 0) { binder_user_error("%d:%d BC_INCREFS_DONE node %d has no pending increfs request\n", proc->pid, thread->pid, node->debug_id); binder_node_inner_unlock(node); binder_put_node(node); break; } node->pending_weak_ref = 0; } free_node = binder_dec_node_nilocked(node, cmd == BC_ACQUIRE_DONE, 0); WARN_ON(free_node); binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s node %d ls %d lw %d tr %d\n", proc->pid, thread->pid, cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", node->debug_id, node->local_strong_refs, node->local_weak_refs, node->tmp_refs); binder_node_inner_unlock(node); binder_put_node(node); break; } case BC_ATTEMPT_ACQUIRE: pr_err("BC_ATTEMPT_ACQUIRE not supported\n"); return -EINVAL; case BC_ACQUIRE_RESULT: pr_err("BC_ACQUIRE_RESULT not supported\n"); return -EINVAL; case BC_FREE_BUFFER: { binder_uintptr_t data_ptr; struct binder_buffer *buffer; if (get_user(data_ptr, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); buffer = binder_alloc_prepare_to_free(&proc->alloc, data_ptr); if (IS_ERR_OR_NULL(buffer)) { if (PTR_ERR(buffer) == -EPERM) { binder_user_error( "%d:%d BC_FREE_BUFFER u%016llx matched unreturned or currently freeing buffer\n", proc->pid, thread->pid, (u64)data_ptr); } else { binder_user_error( "%d:%d BC_FREE_BUFFER u%016llx no match\n", proc->pid, thread->pid, (u64)data_ptr); } break; } binder_debug(BINDER_DEBUG_FREE_BUFFER, "%d:%d BC_FREE_BUFFER u%016llx found buffer %d for %s transaction\n", proc->pid, thread->pid, (u64)data_ptr, buffer->debug_id, buffer->transaction ? "active" : "finished"); binder_free_buf(proc, thread, buffer, false); break; } case BC_TRANSACTION_SG: case BC_REPLY_SG: { struct binder_transaction_data_sg tr; if (copy_from_user(&tr, ptr, sizeof(tr))) return -EFAULT; ptr += sizeof(tr); binder_transaction(proc, thread, &tr.transaction_data, cmd == BC_REPLY_SG, tr.buffers_size); break; } case BC_TRANSACTION: case BC_REPLY: { struct binder_transaction_data tr; if (copy_from_user(&tr, ptr, sizeof(tr))) return -EFAULT; ptr += sizeof(tr); binder_transaction(proc, thread, &tr, cmd == BC_REPLY, 0); break; } case BC_REGISTER_LOOPER: binder_debug(BINDER_DEBUG_THREADS, "%d:%d BC_REGISTER_LOOPER\n", proc->pid, thread->pid); binder_inner_proc_lock(proc); if (thread->looper & BINDER_LOOPER_STATE_ENTERED) { thread->looper |= BINDER_LOOPER_STATE_INVALID; binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called after BC_ENTER_LOOPER\n", proc->pid, thread->pid); } else if (proc->requested_threads == 0) { thread->looper |= BINDER_LOOPER_STATE_INVALID; binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called without request\n", proc->pid, thread->pid); } else { proc->requested_threads--; proc->requested_threads_started++; } thread->looper |= BINDER_LOOPER_STATE_REGISTERED; binder_inner_proc_unlock(proc); break; case BC_ENTER_LOOPER: binder_debug(BINDER_DEBUG_THREADS, "%d:%d BC_ENTER_LOOPER\n", proc->pid, thread->pid); if (thread->looper & BINDER_LOOPER_STATE_REGISTERED) { thread->looper |= BINDER_LOOPER_STATE_INVALID; binder_user_error("%d:%d ERROR: BC_ENTER_LOOPER called after BC_REGISTER_LOOPER\n", proc->pid, thread->pid); } thread->looper |= BINDER_LOOPER_STATE_ENTERED; break; case BC_EXIT_LOOPER: binder_debug(BINDER_DEBUG_THREADS, "%d:%d BC_EXIT_LOOPER\n", proc->pid, thread->pid); thread->looper |= BINDER_LOOPER_STATE_EXITED; break; case BC_REQUEST_DEATH_NOTIFICATION: case BC_CLEAR_DEATH_NOTIFICATION: { uint32_t target; binder_uintptr_t cookie; struct binder_ref *ref; struct binder_ref_death *death = NULL; if (get_user(target, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); if (get_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); if (cmd == BC_REQUEST_DEATH_NOTIFICATION) { /* * Allocate memory for death notification * before taking lock */ death = kzalloc(sizeof(*death), GFP_KERNEL); if (death == NULL) { WARN_ON(thread->return_error.cmd != BR_OK); thread->return_error.cmd = BR_ERROR; binder_enqueue_thread_work( thread, &thread->return_error.work); binder_debug( BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n", proc->pid, thread->pid); break; } } binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, target, false); if (ref == NULL) { binder_user_error("%d:%d %s invalid ref %d\n", proc->pid, thread->pid, cmd == BC_REQUEST_DEATH_NOTIFICATION ? "BC_REQUEST_DEATH_NOTIFICATION" : "BC_CLEAR_DEATH_NOTIFICATION", target); binder_proc_unlock(proc); kfree(death); break; } binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION, "%d:%d %s %016llx ref %d desc %d s %d w %d for node %d\n", proc->pid, thread->pid, cmd == BC_REQUEST_DEATH_NOTIFICATION ? "BC_REQUEST_DEATH_NOTIFICATION" : "BC_CLEAR_DEATH_NOTIFICATION", (u64)cookie, ref->data.debug_id, ref->data.desc, ref->data.strong, ref->data.weak, ref->node->debug_id); binder_node_lock(ref->node); if (cmd == BC_REQUEST_DEATH_NOTIFICATION) { if (ref->death) { binder_user_error("%d:%d BC_REQUEST_DEATH_NOTIFICATION death notification already set\n", proc->pid, thread->pid); binder_node_unlock(ref->node); binder_proc_unlock(proc); kfree(death); break; } binder_stats_created(BINDER_STAT_DEATH); INIT_LIST_HEAD(&death->work.entry); death->cookie = cookie; ref->death = death; if (ref->node->proc == NULL) { ref->death->work.type = BINDER_WORK_DEAD_BINDER; binder_inner_proc_lock(proc); binder_enqueue_work_ilocked( &ref->death->work, &proc->todo); binder_wakeup_proc_ilocked(proc); binder_inner_proc_unlock(proc); } } else { if (ref->death == NULL) { binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification not active\n", proc->pid, thread->pid); binder_node_unlock(ref->node); binder_proc_unlock(proc); break; } death = ref->death; if (death->cookie != cookie) { binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification cookie mismatch %016llx != %016llx\n", proc->pid, thread->pid, (u64)death->cookie, (u64)cookie); binder_node_unlock(ref->node); binder_proc_unlock(proc); break; } ref->death = NULL; binder_inner_proc_lock(proc); if (list_empty(&death->work.entry)) { death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION; if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) binder_enqueue_thread_work_ilocked( thread, &death->work); else { binder_enqueue_work_ilocked( &death->work, &proc->todo); binder_wakeup_proc_ilocked( proc); } } else { BUG_ON(death->work.type != BINDER_WORK_DEAD_BINDER); death->work.type = BINDER_WORK_DEAD_BINDER_AND_CLEAR; } binder_inner_proc_unlock(proc); } binder_node_unlock(ref->node); binder_proc_unlock(proc); } break; case BC_DEAD_BINDER_DONE: { struct binder_work *w; binder_uintptr_t cookie; struct binder_ref_death *death = NULL; if (get_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(cookie); binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->delivered_death, entry) { struct binder_ref_death *tmp_death = container_of(w, struct binder_ref_death, work); if (tmp_death->cookie == cookie) { death = tmp_death; break; } } binder_debug(BINDER_DEBUG_DEAD_BINDER, "%d:%d BC_DEAD_BINDER_DONE %016llx found %pK\n", proc->pid, thread->pid, (u64)cookie, death); if (death == NULL) { binder_user_error("%d:%d BC_DEAD_BINDER_DONE %016llx not found\n", proc->pid, thread->pid, (u64)cookie); binder_inner_proc_unlock(proc); break; } binder_dequeue_work_ilocked(&death->work); if (death->work.type == BINDER_WORK_DEAD_BINDER_AND_CLEAR) { death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION; if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) binder_enqueue_thread_work_ilocked( thread, &death->work); else { binder_enqueue_work_ilocked( &death->work, &proc->todo); binder_wakeup_proc_ilocked(proc); } } binder_inner_proc_unlock(proc); } break; default: pr_err("%d:%d unknown command %u\n", proc->pid, thread->pid, cmd); return -EINVAL; } *consumed = ptr - buffer; } return 0; } static void binder_stat_br(struct binder_proc *proc, struct binder_thread *thread, uint32_t cmd) { trace_binder_return(cmd); if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.br)) { atomic_inc(&binder_stats.br[_IOC_NR(cmd)]); atomic_inc(&proc->stats.br[_IOC_NR(cmd)]); atomic_inc(&thread->stats.br[_IOC_NR(cmd)]); } } static int binder_put_node_cmd(struct binder_proc *proc, struct binder_thread *thread, void __user **ptrp, binder_uintptr_t node_ptr, binder_uintptr_t node_cookie, int node_debug_id, uint32_t cmd, const char *cmd_name) { void __user *ptr = *ptrp; if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); if (put_user(node_ptr, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); if (put_user(node_cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); binder_stat_br(proc, thread, cmd); binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s %d u%016llx c%016llx\n", proc->pid, thread->pid, cmd_name, node_debug_id, (u64)node_ptr, (u64)node_cookie); *ptrp = ptr; return 0; } static int binder_wait_for_work(struct binder_thread *thread, bool do_proc_work) { DEFINE_WAIT(wait); struct binder_proc *proc = thread->proc; int ret = 0; binder_inner_proc_lock(proc); for (;;) { prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE); if (binder_has_work_ilocked(thread, do_proc_work)) break; if (do_proc_work) list_add(&thread->waiting_thread_node, &proc->waiting_threads); binder_inner_proc_unlock(proc); schedule(); binder_inner_proc_lock(proc); list_del_init(&thread->waiting_thread_node); if (signal_pending(current)) { ret = -EINTR; break; } } finish_wait(&thread->wait, &wait); binder_inner_proc_unlock(proc); return ret; } /** * binder_apply_fd_fixups() - finish fd translation * @proc: binder_proc associated @t->buffer * @t: binder transaction with list of fd fixups * * Now that we are in the context of the transaction target * process, we can allocate and install fds. Process the * list of fds to translate and fixup the buffer with the * new fds first and only then install the files. * * If we fail to allocate an fd, skip the install and release * any fds that have already been allocated. */ static int binder_apply_fd_fixups(struct binder_proc *proc, struct binder_transaction *t) { struct binder_txn_fd_fixup *fixup, *tmp; int ret = 0; list_for_each_entry(fixup, &t->fd_fixups, fixup_entry) { int fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) { binder_debug(BINDER_DEBUG_TRANSACTION, "failed fd fixup txn %d fd %d\n", t->debug_id, fd); ret = -ENOMEM; goto err; } binder_debug(BINDER_DEBUG_TRANSACTION, "fd fixup txn %d fd %d\n", t->debug_id, fd); trace_binder_transaction_fd_recv(t, fd, fixup->offset); fixup->target_fd = fd; if (binder_alloc_copy_to_buffer(&proc->alloc, t->buffer, fixup->offset, &fd, sizeof(u32))) { ret = -EINVAL; goto err; } } list_for_each_entry_safe(fixup, tmp, &t->fd_fixups, fixup_entry) { fd_install(fixup->target_fd, fixup->file); list_del(&fixup->fixup_entry); kfree(fixup); } return ret; err: binder_free_txn_fixups(t); return ret; } static int binder_thread_read(struct binder_proc *proc, struct binder_thread *thread, binder_uintptr_t binder_buffer, size_t size, binder_size_t *consumed, int non_block) { void __user *buffer = (void __user *)(uintptr_t)binder_buffer; void __user *ptr = buffer + *consumed; void __user *end = buffer + size; int ret = 0; int wait_for_proc_work; if (*consumed == 0) { if (put_user(BR_NOOP, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); } retry: binder_inner_proc_lock(proc); wait_for_proc_work = binder_available_for_proc_work_ilocked(thread); binder_inner_proc_unlock(proc); thread->looper |= BINDER_LOOPER_STATE_WAITING; trace_binder_wait_for_work(wait_for_proc_work, !!thread->transaction_stack, !binder_worklist_empty(proc, &thread->todo)); if (wait_for_proc_work) { if (!(thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED))) { binder_user_error("%d:%d ERROR: Thread waiting for process work before calling BC_REGISTER_LOOPER or BC_ENTER_LOOPER (state %x)\n", proc->pid, thread->pid, thread->looper); wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); } binder_set_nice(proc->default_priority); } if (non_block) { if (!binder_has_work(thread, wait_for_proc_work)) ret = -EAGAIN; } else { ret = binder_wait_for_work(thread, wait_for_proc_work); } thread->looper &= ~BINDER_LOOPER_STATE_WAITING; if (ret) return ret; while (1) { uint32_t cmd; struct binder_transaction_data_secctx tr; struct binder_transaction_data *trd = &tr.transaction_data; struct binder_work *w = NULL; struct list_head *list = NULL; struct binder_transaction *t = NULL; struct binder_thread *t_from; size_t trsize = sizeof(*trd); binder_inner_proc_lock(proc); if (!binder_worklist_empty_ilocked(&thread->todo)) list = &thread->todo; else if (!binder_worklist_empty_ilocked(&proc->todo) && wait_for_proc_work) list = &proc->todo; else { binder_inner_proc_unlock(proc); /* no data added */ if (ptr - buffer == 4 && !thread->looper_need_return) goto retry; break; } if (end - ptr < sizeof(tr) + 4) { binder_inner_proc_unlock(proc); break; } w = binder_dequeue_work_head_ilocked(list); if (binder_worklist_empty_ilocked(&thread->todo)) thread->process_todo = false; switch (w->type) { case BINDER_WORK_TRANSACTION: { binder_inner_proc_unlock(proc); t = container_of(w, struct binder_transaction, work); } break; case BINDER_WORK_RETURN_ERROR: { struct binder_error *e = container_of( w, struct binder_error, work); WARN_ON(e->cmd == BR_OK); binder_inner_proc_unlock(proc); if (put_user(e->cmd, (uint32_t __user *)ptr)) return -EFAULT; cmd = e->cmd; e->cmd = BR_OK; ptr += sizeof(uint32_t); binder_stat_br(proc, thread, cmd); } break; case BINDER_WORK_TRANSACTION_COMPLETE: case BINDER_WORK_TRANSACTION_PENDING: case BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT: { if (proc->oneway_spam_detection_enabled && w->type == BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT) cmd = BR_ONEWAY_SPAM_SUSPECT; else if (w->type == BINDER_WORK_TRANSACTION_PENDING) cmd = BR_TRANSACTION_PENDING_FROZEN; else cmd = BR_TRANSACTION_COMPLETE; binder_inner_proc_unlock(proc); kfree(w); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); binder_stat_br(proc, thread, cmd); binder_debug(BINDER_DEBUG_TRANSACTION_COMPLETE, "%d:%d BR_TRANSACTION_COMPLETE\n", proc->pid, thread->pid); } break; case BINDER_WORK_NODE: { struct binder_node *node = container_of(w, struct binder_node, work); int strong, weak; binder_uintptr_t node_ptr = node->ptr; binder_uintptr_t node_cookie = node->cookie; int node_debug_id = node->debug_id; int has_weak_ref; int has_strong_ref; void __user *orig_ptr = ptr; BUG_ON(proc != node->proc); strong = node->internal_strong_refs || node->local_strong_refs; weak = !hlist_empty(&node->refs) || node->local_weak_refs || node->tmp_refs || strong; has_strong_ref = node->has_strong_ref; has_weak_ref = node->has_weak_ref; if (weak && !has_weak_ref) { node->has_weak_ref = 1; node->pending_weak_ref = 1; node->local_weak_refs++; } if (strong && !has_strong_ref) { node->has_strong_ref = 1; node->pending_strong_ref = 1; node->local_strong_refs++; } if (!strong && has_strong_ref) node->has_strong_ref = 0; if (!weak && has_weak_ref) node->has_weak_ref = 0; if (!weak && !strong) { binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d:%d node %d u%016llx c%016llx deleted\n", proc->pid, thread->pid, node_debug_id, (u64)node_ptr, (u64)node_cookie); rb_erase(&node->rb_node, &proc->nodes); binder_inner_proc_unlock(proc); binder_node_lock(node); /* * Acquire the node lock before freeing the * node to serialize with other threads that * may have been holding the node lock while * decrementing this node (avoids race where * this thread frees while the other thread * is unlocking the node after the final * decrement) */ binder_node_unlock(node); binder_free_node(node); } else binder_inner_proc_unlock(proc); if (weak && !has_weak_ref) ret = binder_put_node_cmd( proc, thread, &ptr, node_ptr, node_cookie, node_debug_id, BR_INCREFS, "BR_INCREFS"); if (!ret && strong && !has_strong_ref) ret = binder_put_node_cmd( proc, thread, &ptr, node_ptr, node_cookie, node_debug_id, BR_ACQUIRE, "BR_ACQUIRE"); if (!ret && !strong && has_strong_ref) ret = binder_put_node_cmd( proc, thread, &ptr, node_ptr, node_cookie, node_debug_id, BR_RELEASE, "BR_RELEASE"); if (!ret && !weak && has_weak_ref) ret = binder_put_node_cmd( proc, thread, &ptr, node_ptr, node_cookie, node_debug_id, BR_DECREFS, "BR_DECREFS"); if (orig_ptr == ptr) binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d:%d node %d u%016llx c%016llx state unchanged\n", proc->pid, thread->pid, node_debug_id, (u64)node_ptr, (u64)node_cookie); if (ret) return ret; } break; case BINDER_WORK_DEAD_BINDER: case BINDER_WORK_DEAD_BINDER_AND_CLEAR: case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: { struct binder_ref_death *death; uint32_t cmd; binder_uintptr_t cookie; death = container_of(w, struct binder_ref_death, work); if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) cmd = BR_CLEAR_DEATH_NOTIFICATION_DONE; else cmd = BR_DEAD_BINDER; cookie = death->cookie; binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION, "%d:%d %s %016llx\n", proc->pid, thread->pid, cmd == BR_DEAD_BINDER ? "BR_DEAD_BINDER" : "BR_CLEAR_DEATH_NOTIFICATION_DONE", (u64)cookie); if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) { binder_inner_proc_unlock(proc); kfree(death); binder_stats_deleted(BINDER_STAT_DEATH); } else { binder_enqueue_work_ilocked( w, &proc->delivered_death); binder_inner_proc_unlock(proc); } if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); if (put_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); binder_stat_br(proc, thread, cmd); if (cmd == BR_DEAD_BINDER) goto done; /* DEAD_BINDER notifications can cause transactions */ } break; default: binder_inner_proc_unlock(proc); pr_err("%d:%d: bad work type %d\n", proc->pid, thread->pid, w->type); break; } if (!t) continue; BUG_ON(t->buffer == NULL); if (t->buffer->target_node) { struct binder_node *target_node = t->buffer->target_node; trd->target.ptr = target_node->ptr; trd->cookie = target_node->cookie; t->saved_priority = task_nice(current); if (t->priority < target_node->min_priority && !(t->flags & TF_ONE_WAY)) binder_set_nice(t->priority); else if (!(t->flags & TF_ONE_WAY) || t->saved_priority > target_node->min_priority) binder_set_nice(target_node->min_priority); cmd = BR_TRANSACTION; } else { trd->target.ptr = 0; trd->cookie = 0; cmd = BR_REPLY; } trd->code = t->code; trd->flags = t->flags; trd->sender_euid = from_kuid(current_user_ns(), t->sender_euid); t_from = binder_get_txn_from(t); if (t_from) { struct task_struct *sender = t_from->proc->tsk; trd->sender_pid = task_tgid_nr_ns(sender, task_active_pid_ns(current)); } else { trd->sender_pid = 0; } ret = binder_apply_fd_fixups(proc, t); if (ret) { struct binder_buffer *buffer = t->buffer; bool oneway = !!(t->flags & TF_ONE_WAY); int tid = t->debug_id; if (t_from) binder_thread_dec_tmpref(t_from); buffer->transaction = NULL; binder_cleanup_transaction(t, "fd fixups failed", BR_FAILED_REPLY); binder_free_buf(proc, thread, buffer, true); binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d %stransaction %d fd fixups failed %d/%d, line %d\n", proc->pid, thread->pid, oneway ? "async " : (cmd == BR_REPLY ? "reply " : ""), tid, BR_FAILED_REPLY, ret, __LINE__); if (cmd == BR_REPLY) { cmd = BR_FAILED_REPLY; if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); binder_stat_br(proc, thread, cmd); break; } continue; } trd->data_size = t->buffer->data_size; trd->offsets_size = t->buffer->offsets_size; trd->data.ptr.buffer = t->buffer->user_data; trd->data.ptr.offsets = trd->data.ptr.buffer + ALIGN(t->buffer->data_size, sizeof(void *)); tr.secctx = t->security_ctx; if (t->security_ctx) { cmd = BR_TRANSACTION_SEC_CTX; trsize = sizeof(tr); } if (put_user(cmd, (uint32_t __user *)ptr)) { if (t_from) binder_thread_dec_tmpref(t_from); binder_cleanup_transaction(t, "put_user failed", BR_FAILED_REPLY); return -EFAULT; } ptr += sizeof(uint32_t); if (copy_to_user(ptr, &tr, trsize)) { if (t_from) binder_thread_dec_tmpref(t_from); binder_cleanup_transaction(t, "copy_to_user failed", BR_FAILED_REPLY); return -EFAULT; } ptr += trsize; trace_binder_transaction_received(t); binder_stat_br(proc, thread, cmd); binder_debug(BINDER_DEBUG_TRANSACTION, "%d:%d %s %d %d:%d, cmd %u size %zd-%zd ptr %016llx-%016llx\n", proc->pid, thread->pid, (cmd == BR_TRANSACTION) ? "BR_TRANSACTION" : (cmd == BR_TRANSACTION_SEC_CTX) ? "BR_TRANSACTION_SEC_CTX" : "BR_REPLY", t->debug_id, t_from ? t_from->proc->pid : 0, t_from ? t_from->pid : 0, cmd, t->buffer->data_size, t->buffer->offsets_size, (u64)trd->data.ptr.buffer, (u64)trd->data.ptr.offsets); if (t_from) binder_thread_dec_tmpref(t_from); t->buffer->allow_user_free = 1; if (cmd != BR_REPLY && !(t->flags & TF_ONE_WAY)) { binder_inner_proc_lock(thread->proc); t->to_parent = thread->transaction_stack; t->to_thread = thread; thread->transaction_stack = t; binder_inner_proc_unlock(thread->proc); } else { binder_free_transaction(t); } break; } done: *consumed = ptr - buffer; binder_inner_proc_lock(proc); if (proc->requested_threads == 0 && list_empty(&thread->proc->waiting_threads) && proc->requested_threads_started < proc->max_threads && (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) /* the user-space code fails to */ /*spawn a new thread if we leave this out */) { proc->requested_threads++; binder_inner_proc_unlock(proc); binder_debug(BINDER_DEBUG_THREADS, "%d:%d BR_SPAWN_LOOPER\n", proc->pid, thread->pid); if (put_user(BR_SPAWN_LOOPER, (uint32_t __user *)buffer)) return -EFAULT; binder_stat_br(proc, thread, BR_SPAWN_LOOPER); } else binder_inner_proc_unlock(proc); return 0; } static void binder_release_work(struct binder_proc *proc, struct list_head *list) { struct binder_work *w; enum binder_work_type wtype; while (1) { binder_inner_proc_lock(proc); w = binder_dequeue_work_head_ilocked(list); wtype = w ? w->type : 0; binder_inner_proc_unlock(proc); if (!w) return; switch (wtype) { case BINDER_WORK_TRANSACTION: { struct binder_transaction *t; t = container_of(w, struct binder_transaction, work); binder_cleanup_transaction(t, "process died.", BR_DEAD_REPLY); } break; case BINDER_WORK_RETURN_ERROR: { struct binder_error *e = container_of( w, struct binder_error, work); binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered TRANSACTION_ERROR: %u\n", e->cmd); } break; case BINDER_WORK_TRANSACTION_PENDING: case BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT: case BINDER_WORK_TRANSACTION_COMPLETE: { binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered TRANSACTION_COMPLETE\n"); kfree(w); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); } break; case BINDER_WORK_DEAD_BINDER_AND_CLEAR: case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: { struct binder_ref_death *death; death = container_of(w, struct binder_ref_death, work); binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered death notification, %016llx\n", (u64)death->cookie); kfree(death); binder_stats_deleted(BINDER_STAT_DEATH); } break; case BINDER_WORK_NODE: break; default: pr_err("unexpected work type, %d, not freed\n", wtype); break; } } } static struct binder_thread *binder_get_thread_ilocked( struct binder_proc *proc, struct binder_thread *new_thread) { struct binder_thread *thread = NULL; struct rb_node *parent = NULL; struct rb_node **p = &proc->threads.rb_node; while (*p) { parent = *p; thread = rb_entry(parent, struct binder_thread, rb_node); if (current->pid < thread->pid) p = &(*p)->rb_left; else if (current->pid > thread->pid) p = &(*p)->rb_right; else return thread; } if (!new_thread) return NULL; thread = new_thread; binder_stats_created(BINDER_STAT_THREAD); thread->proc = proc; thread->pid = current->pid; atomic_set(&thread->tmp_ref, 0); init_waitqueue_head(&thread->wait); INIT_LIST_HEAD(&thread->todo); rb_link_node(&thread->rb_node, parent, p); rb_insert_color(&thread->rb_node, &proc->threads); thread->looper_need_return = true; thread->return_error.work.type = BINDER_WORK_RETURN_ERROR; thread->return_error.cmd = BR_OK; thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR; thread->reply_error.cmd = BR_OK; thread->ee.command = BR_OK; INIT_LIST_HEAD(&new_thread->waiting_thread_node); return thread; } static struct binder_thread *binder_get_thread(struct binder_proc *proc) { struct binder_thread *thread; struct binder_thread *new_thread; binder_inner_proc_lock(proc); thread = binder_get_thread_ilocked(proc, NULL); binder_inner_proc_unlock(proc); if (!thread) { new_thread = kzalloc(sizeof(*thread), GFP_KERNEL); if (new_thread == NULL) return NULL; binder_inner_proc_lock(proc); thread = binder_get_thread_ilocked(proc, new_thread); binder_inner_proc_unlock(proc); if (thread != new_thread) kfree(new_thread); } return thread; } static void binder_free_proc(struct binder_proc *proc) { struct binder_device *device; BUG_ON(!list_empty(&proc->todo)); BUG_ON(!list_empty(&proc->delivered_death)); if (proc->outstanding_txns) pr_warn("%s: Unexpected outstanding_txns %d\n", __func__, proc->outstanding_txns); device = container_of(proc->context, struct binder_device, context); if (refcount_dec_and_test(&device->ref)) { kfree(proc->context->name); kfree(device); } binder_alloc_deferred_release(&proc->alloc); put_task_struct(proc->tsk); put_cred(proc->cred); binder_stats_deleted(BINDER_STAT_PROC); kfree(proc); } static void binder_free_thread(struct binder_thread *thread) { BUG_ON(!list_empty(&thread->todo)); binder_stats_deleted(BINDER_STAT_THREAD); binder_proc_dec_tmpref(thread->proc); kfree(thread); } static int binder_thread_release(struct binder_proc *proc, struct binder_thread *thread) { struct binder_transaction *t; struct binder_transaction *send_reply = NULL; int active_transactions = 0; struct binder_transaction *last_t = NULL; binder_inner_proc_lock(thread->proc); /* * take a ref on the proc so it survives * after we remove this thread from proc->threads. * The corresponding dec is when we actually * free the thread in binder_free_thread() */ proc->tmp_ref++; /* * take a ref on this thread to ensure it * survives while we are releasing it */ atomic_inc(&thread->tmp_ref); rb_erase(&thread->rb_node, &proc->threads); t = thread->transaction_stack; if (t) { spin_lock(&t->lock); if (t->to_thread == thread) send_reply = t; } else { __acquire(&t->lock); } thread->is_dead = true; while (t) { last_t = t; active_transactions++; binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "release %d:%d transaction %d %s, still active\n", proc->pid, thread->pid, t->debug_id, (t->to_thread == thread) ? "in" : "out"); if (t->to_thread == thread) { thread->proc->outstanding_txns--; t->to_proc = NULL; t->to_thread = NULL; if (t->buffer) { t->buffer->transaction = NULL; t->buffer = NULL; } t = t->to_parent; } else if (t->from == thread) { t->from = NULL; t = t->from_parent; } else BUG(); spin_unlock(&last_t->lock); if (t) spin_lock(&t->lock); else __acquire(&t->lock); } /* annotation for sparse, lock not acquired in last iteration above */ __release(&t->lock); /* * If this thread used poll, make sure we remove the waitqueue from any * poll data structures holding it. */ if (thread->looper & BINDER_LOOPER_STATE_POLL) wake_up_pollfree(&thread->wait); binder_inner_proc_unlock(thread->proc); /* * This is needed to avoid races between wake_up_pollfree() above and * someone else removing the last entry from the queue for other reasons * (e.g. ep_remove_wait_queue() being called due to an epoll file * descriptor being closed). Such other users hold an RCU read lock, so * we can be sure they're done after we call synchronize_rcu(). */ if (thread->looper & BINDER_LOOPER_STATE_POLL) synchronize_rcu(); if (send_reply) binder_send_failed_reply(send_reply, BR_DEAD_REPLY); binder_release_work(proc, &thread->todo); binder_thread_dec_tmpref(thread); return active_transactions; } static __poll_t binder_poll(struct file *filp, struct poll_table_struct *wait) { struct binder_proc *proc = filp->private_data; struct binder_thread *thread = NULL; bool wait_for_proc_work; thread = binder_get_thread(proc); if (!thread) return EPOLLERR; binder_inner_proc_lock(thread->proc); thread->looper |= BINDER_LOOPER_STATE_POLL; wait_for_proc_work = binder_available_for_proc_work_ilocked(thread); binder_inner_proc_unlock(thread->proc); poll_wait(filp, &thread->wait, wait); if (binder_has_work(thread, wait_for_proc_work)) return EPOLLIN; return 0; } static int binder_ioctl_write_read(struct file *filp, unsigned long arg, struct binder_thread *thread) { int ret = 0; struct binder_proc *proc = filp->private_data; void __user *ubuf = (void __user *)arg; struct binder_write_read bwr; if (copy_from_user(&bwr, ubuf, sizeof(bwr))) { ret = -EFAULT; goto out; } binder_debug(BINDER_DEBUG_READ_WRITE, "%d:%d write %lld at %016llx, read %lld at %016llx\n", proc->pid, thread->pid, (u64)bwr.write_size, (u64)bwr.write_buffer, (u64)bwr.read_size, (u64)bwr.read_buffer); if (bwr.write_size > 0) { ret = binder_thread_write(proc, thread, bwr.write_buffer, bwr.write_size, &bwr.write_consumed); trace_binder_write_done(ret); if (ret < 0) { bwr.read_consumed = 0; if (copy_to_user(ubuf, &bwr, sizeof(bwr))) ret = -EFAULT; goto out; } } if (bwr.read_size > 0) { ret = binder_thread_read(proc, thread, bwr.read_buffer, bwr.read_size, &bwr.read_consumed, filp->f_flags & O_NONBLOCK); trace_binder_read_done(ret); binder_inner_proc_lock(proc); if (!binder_worklist_empty_ilocked(&proc->todo)) binder_wakeup_proc_ilocked(proc); binder_inner_proc_unlock(proc); if (ret < 0) { if (copy_to_user(ubuf, &bwr, sizeof(bwr))) ret = -EFAULT; goto out; } } binder_debug(BINDER_DEBUG_READ_WRITE, "%d:%d wrote %lld of %lld, read return %lld of %lld\n", proc->pid, thread->pid, (u64)bwr.write_consumed, (u64)bwr.write_size, (u64)bwr.read_consumed, (u64)bwr.read_size); if (copy_to_user(ubuf, &bwr, sizeof(bwr))) { ret = -EFAULT; goto out; } out: return ret; } static int binder_ioctl_set_ctx_mgr(struct file *filp, struct flat_binder_object *fbo) { int ret = 0; struct binder_proc *proc = filp->private_data; struct binder_context *context = proc->context; struct binder_node *new_node; kuid_t curr_euid = current_euid(); mutex_lock(&context->context_mgr_node_lock); if (context->binder_context_mgr_node) { pr_err("BINDER_SET_CONTEXT_MGR already set\n"); ret = -EBUSY; goto out; } ret = security_binder_set_context_mgr(proc->cred); if (ret < 0) goto out; if (uid_valid(context->binder_context_mgr_uid)) { if (!uid_eq(context->binder_context_mgr_uid, curr_euid)) { pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n", from_kuid(&init_user_ns, curr_euid), from_kuid(&init_user_ns, context->binder_context_mgr_uid)); ret = -EPERM; goto out; } } else { context->binder_context_mgr_uid = curr_euid; } new_node = binder_new_node(proc, fbo); if (!new_node) { ret = -ENOMEM; goto out; } binder_node_lock(new_node); new_node->local_weak_refs++; new_node->local_strong_refs++; new_node->has_strong_ref = 1; new_node->has_weak_ref = 1; context->binder_context_mgr_node = new_node; binder_node_unlock(new_node); binder_put_node(new_node); out: mutex_unlock(&context->context_mgr_node_lock); return ret; } static int binder_ioctl_get_node_info_for_ref(struct binder_proc *proc, struct binder_node_info_for_ref *info) { struct binder_node *node; struct binder_context *context = proc->context; __u32 handle = info->handle; if (info->strong_count || info->weak_count || info->reserved1 || info->reserved2 || info->reserved3) { binder_user_error("%d BINDER_GET_NODE_INFO_FOR_REF: only handle may be non-zero.", proc->pid); return -EINVAL; } /* This ioctl may only be used by the context manager */ mutex_lock(&context->context_mgr_node_lock); if (!context->binder_context_mgr_node || context->binder_context_mgr_node->proc != proc) { mutex_unlock(&context->context_mgr_node_lock); return -EPERM; } mutex_unlock(&context->context_mgr_node_lock); node = binder_get_node_from_ref(proc, handle, true, NULL); if (!node) return -EINVAL; info->strong_count = node->local_strong_refs + node->internal_strong_refs; info->weak_count = node->local_weak_refs; binder_put_node(node); return 0; } static int binder_ioctl_get_node_debug_info(struct binder_proc *proc, struct binder_node_debug_info *info) { struct rb_node *n; binder_uintptr_t ptr = info->ptr; memset(info, 0, sizeof(*info)); binder_inner_proc_lock(proc); for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) { struct binder_node *node = rb_entry(n, struct binder_node, rb_node); if (node->ptr > ptr) { info->ptr = node->ptr; info->cookie = node->cookie; info->has_strong_ref = node->has_strong_ref; info->has_weak_ref = node->has_weak_ref; break; } } binder_inner_proc_unlock(proc); return 0; } static bool binder_txns_pending_ilocked(struct binder_proc *proc) { struct rb_node *n; struct binder_thread *thread; if (proc->outstanding_txns > 0) return true; for (n = rb_first(&proc->threads); n; n = rb_next(n)) { thread = rb_entry(n, struct binder_thread, rb_node); if (thread->transaction_stack) return true; } return false; } static int binder_ioctl_freeze(struct binder_freeze_info *info, struct binder_proc *target_proc) { int ret = 0; if (!info->enable) { binder_inner_proc_lock(target_proc); target_proc->sync_recv = false; target_proc->async_recv = false; target_proc->is_frozen = false; binder_inner_proc_unlock(target_proc); return 0; } /* * Freezing the target. Prevent new transactions by * setting frozen state. If timeout specified, wait * for transactions to drain. */ binder_inner_proc_lock(target_proc); target_proc->sync_recv = false; target_proc->async_recv = false; target_proc->is_frozen = true; binder_inner_proc_unlock(target_proc); if (info->timeout_ms > 0) ret = wait_event_interruptible_timeout( target_proc->freeze_wait, (!target_proc->outstanding_txns), msecs_to_jiffies(info->timeout_ms)); /* Check pending transactions that wait for reply */ if (ret >= 0) { binder_inner_proc_lock(target_proc); if (binder_txns_pending_ilocked(target_proc)) ret = -EAGAIN; binder_inner_proc_unlock(target_proc); } if (ret < 0) { binder_inner_proc_lock(target_proc); target_proc->is_frozen = false; binder_inner_proc_unlock(target_proc); } return ret; } static int binder_ioctl_get_freezer_info( struct binder_frozen_status_info *info) { struct binder_proc *target_proc; bool found = false; __u32 txns_pending; info->sync_recv = 0; info->async_recv = 0; mutex_lock(&binder_procs_lock); hlist_for_each_entry(target_proc, &binder_procs, proc_node) { if (target_proc->pid == info->pid) { found = true; binder_inner_proc_lock(target_proc); txns_pending = binder_txns_pending_ilocked(target_proc); info->sync_recv |= target_proc->sync_recv | (txns_pending << 1); info->async_recv |= target_proc->async_recv; binder_inner_proc_unlock(target_proc); } } mutex_unlock(&binder_procs_lock); if (!found) return -EINVAL; return 0; } static int binder_ioctl_get_extended_error(struct binder_thread *thread, void __user *ubuf) { struct binder_extended_error ee; binder_inner_proc_lock(thread->proc); ee = thread->ee; binder_set_extended_error(&thread->ee, 0, BR_OK, 0); binder_inner_proc_unlock(thread->proc); if (copy_to_user(ubuf, &ee, sizeof(ee))) return -EFAULT; return 0; } static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int ret; struct binder_proc *proc = filp->private_data; struct binder_thread *thread; void __user *ubuf = (void __user *)arg; /*pr_info("binder_ioctl: %d:%d %x %lx\n", proc->pid, current->pid, cmd, arg);*/ binder_selftest_alloc(&proc->alloc); trace_binder_ioctl(cmd, arg); ret = wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); if (ret) goto err_unlocked; thread = binder_get_thread(proc); if (thread == NULL) { ret = -ENOMEM; goto err; } switch (cmd) { case BINDER_WRITE_READ: ret = binder_ioctl_write_read(filp, arg, thread); if (ret) goto err; break; case BINDER_SET_MAX_THREADS: { int max_threads; if (copy_from_user(&max_threads, ubuf, sizeof(max_threads))) { ret = -EINVAL; goto err; } binder_inner_proc_lock(proc); proc->max_threads = max_threads; binder_inner_proc_unlock(proc); break; } case BINDER_SET_CONTEXT_MGR_EXT: { struct flat_binder_object fbo; if (copy_from_user(&fbo, ubuf, sizeof(fbo))) { ret = -EINVAL; goto err; } ret = binder_ioctl_set_ctx_mgr(filp, &fbo); if (ret) goto err; break; } case BINDER_SET_CONTEXT_MGR: ret = binder_ioctl_set_ctx_mgr(filp, NULL); if (ret) goto err; break; case BINDER_THREAD_EXIT: binder_debug(BINDER_DEBUG_THREADS, "%d:%d exit\n", proc->pid, thread->pid); binder_thread_release(proc, thread); thread = NULL; break; case BINDER_VERSION: { struct binder_version __user *ver = ubuf; if (put_user(BINDER_CURRENT_PROTOCOL_VERSION, &ver->protocol_version)) { ret = -EINVAL; goto err; } break; } case BINDER_GET_NODE_INFO_FOR_REF: { struct binder_node_info_for_ref info; if (copy_from_user(&info, ubuf, sizeof(info))) { ret = -EFAULT; goto err; } ret = binder_ioctl_get_node_info_for_ref(proc, &info); if (ret < 0) goto err; if (copy_to_user(ubuf, &info, sizeof(info))) { ret = -EFAULT; goto err; } break; } case BINDER_GET_NODE_DEBUG_INFO: { struct binder_node_debug_info info; if (copy_from_user(&info, ubuf, sizeof(info))) { ret = -EFAULT; goto err; } ret = binder_ioctl_get_node_debug_info(proc, &info); if (ret < 0) goto err; if (copy_to_user(ubuf, &info, sizeof(info))) { ret = -EFAULT; goto err; } break; } case BINDER_FREEZE: { struct binder_freeze_info info; struct binder_proc **target_procs = NULL, *target_proc; int target_procs_count = 0, i = 0; ret = 0; if (copy_from_user(&info, ubuf, sizeof(info))) { ret = -EFAULT; goto err; } mutex_lock(&binder_procs_lock); hlist_for_each_entry(target_proc, &binder_procs, proc_node) { if (target_proc->pid == info.pid) target_procs_count++; } if (target_procs_count == 0) { mutex_unlock(&binder_procs_lock); ret = -EINVAL; goto err; } target_procs = kcalloc(target_procs_count, sizeof(struct binder_proc *), GFP_KERNEL); if (!target_procs) { mutex_unlock(&binder_procs_lock); ret = -ENOMEM; goto err; } hlist_for_each_entry(target_proc, &binder_procs, proc_node) { if (target_proc->pid != info.pid) continue; binder_inner_proc_lock(target_proc); target_proc->tmp_ref++; binder_inner_proc_unlock(target_proc); target_procs[i++] = target_proc; } mutex_unlock(&binder_procs_lock); for (i = 0; i < target_procs_count; i++) { if (ret >= 0) ret = binder_ioctl_freeze(&info, target_procs[i]); binder_proc_dec_tmpref(target_procs[i]); } kfree(target_procs); if (ret < 0) goto err; break; } case BINDER_GET_FROZEN_INFO: { struct binder_frozen_status_info info; if (copy_from_user(&info, ubuf, sizeof(info))) { ret = -EFAULT; goto err; } ret = binder_ioctl_get_freezer_info(&info); if (ret < 0) goto err; if (copy_to_user(ubuf, &info, sizeof(info))) { ret = -EFAULT; goto err; } break; } case BINDER_ENABLE_ONEWAY_SPAM_DETECTION: { uint32_t enable; if (copy_from_user(&enable, ubuf, sizeof(enable))) { ret = -EFAULT; goto err; } binder_inner_proc_lock(proc); proc->oneway_spam_detection_enabled = (bool)enable; binder_inner_proc_unlock(proc); break; } case BINDER_GET_EXTENDED_ERROR: ret = binder_ioctl_get_extended_error(thread, ubuf); if (ret < 0) goto err; break; default: ret = -EINVAL; goto err; } ret = 0; err: if (thread) thread->looper_need_return = false; wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); if (ret && ret != -EINTR) pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret); err_unlocked: trace_binder_ioctl_done(ret); return ret; } static void binder_vma_open(struct vm_area_struct *vma) { struct binder_proc *proc = vma->vm_private_data; binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%d open vm area %lx-%lx (%ld K) vma %lx pagep %lx\n", proc->pid, vma->vm_start, vma->vm_end, (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, (unsigned long)pgprot_val(vma->vm_page_prot)); } static void binder_vma_close(struct vm_area_struct *vma) { struct binder_proc *proc = vma->vm_private_data; binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%d close vm area %lx-%lx (%ld K) vma %lx pagep %lx\n", proc->pid, vma->vm_start, vma->vm_end, (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, (unsigned long)pgprot_val(vma->vm_page_prot)); binder_alloc_vma_close(&proc->alloc); } static vm_fault_t binder_vm_fault(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } static const struct vm_operations_struct binder_vm_ops = { .open = binder_vma_open, .close = binder_vma_close, .fault = binder_vm_fault, }; static int binder_mmap(struct file *filp, struct vm_area_struct *vma) { struct binder_proc *proc = filp->private_data; if (proc->tsk != current->group_leader) return -EINVAL; binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d %lx-%lx (%ld K) vma %lx pagep %lx\n", __func__, proc->pid, vma->vm_start, vma->vm_end, (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, (unsigned long)pgprot_val(vma->vm_page_prot)); if (vma->vm_flags & FORBIDDEN_MMAP_FLAGS) { pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, proc->pid, vma->vm_start, vma->vm_end, "bad vm_flags", -EPERM); return -EPERM; } vm_flags_mod(vma, VM_DONTCOPY | VM_MIXEDMAP, VM_MAYWRITE); vma->vm_ops = &binder_vm_ops; vma->vm_private_data = proc; return binder_alloc_mmap_handler(&proc->alloc, vma); } static int binder_open(struct inode *nodp, struct file *filp) { struct binder_proc *proc, *itr; struct binder_device *binder_dev; struct binderfs_info *info; struct dentry *binder_binderfs_dir_entry_proc = NULL; bool existing_pid = false; binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d:%d\n", __func__, current->group_leader->pid, current->pid); proc = kzalloc(sizeof(*proc), GFP_KERNEL); if (proc == NULL) return -ENOMEM; spin_lock_init(&proc->inner_lock); spin_lock_init(&proc->outer_lock); get_task_struct(current->group_leader); proc->tsk = current->group_leader; proc->cred = get_cred(filp->f_cred); INIT_LIST_HEAD(&proc->todo); init_waitqueue_head(&proc->freeze_wait); proc->default_priority = task_nice(current); /* binderfs stashes devices in i_private */ if (is_binderfs_device(nodp)) { binder_dev = nodp->i_private; info = nodp->i_sb->s_fs_info; binder_binderfs_dir_entry_proc = info->proc_log_dir; } else { binder_dev = container_of(filp->private_data, struct binder_device, miscdev); } refcount_inc(&binder_dev->ref); proc->context = &binder_dev->context; binder_alloc_init(&proc->alloc); binder_stats_created(BINDER_STAT_PROC); proc->pid = current->group_leader->pid; INIT_LIST_HEAD(&proc->delivered_death); INIT_LIST_HEAD(&proc->waiting_threads); filp->private_data = proc; mutex_lock(&binder_procs_lock); hlist_for_each_entry(itr, &binder_procs, proc_node) { if (itr->pid == proc->pid) { existing_pid = true; break; } } hlist_add_head(&proc->proc_node, &binder_procs); mutex_unlock(&binder_procs_lock); if (binder_debugfs_dir_entry_proc && !existing_pid) { char strbuf[11]; snprintf(strbuf, sizeof(strbuf), "%u", proc->pid); /* * proc debug entries are shared between contexts. * Only create for the first PID to avoid debugfs log spamming * The printing code will anyway print all contexts for a given * PID so this is not a problem. */ proc->debugfs_entry = debugfs_create_file(strbuf, 0444, binder_debugfs_dir_entry_proc, (void *)(unsigned long)proc->pid, &proc_fops); } if (binder_binderfs_dir_entry_proc && !existing_pid) { char strbuf[11]; struct dentry *binderfs_entry; snprintf(strbuf, sizeof(strbuf), "%u", proc->pid); /* * Similar to debugfs, the process specific log file is shared * between contexts. Only create for the first PID. * This is ok since same as debugfs, the log file will contain * information on all contexts of a given PID. */ binderfs_entry = binderfs_create_file(binder_binderfs_dir_entry_proc, strbuf, &proc_fops, (void *)(unsigned long)proc->pid); if (!IS_ERR(binderfs_entry)) { proc->binderfs_entry = binderfs_entry; } else { int error; error = PTR_ERR(binderfs_entry); pr_warn("Unable to create file %s in binderfs (error %d)\n", strbuf, error); } } return 0; } static int binder_flush(struct file *filp, fl_owner_t id) { struct binder_proc *proc = filp->private_data; binder_defer_work(proc, BINDER_DEFERRED_FLUSH); return 0; } static void binder_deferred_flush(struct binder_proc *proc) { struct rb_node *n; int wake_count = 0; binder_inner_proc_lock(proc); for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) { struct binder_thread *thread = rb_entry(n, struct binder_thread, rb_node); thread->looper_need_return = true; if (thread->looper & BINDER_LOOPER_STATE_WAITING) { wake_up_interruptible(&thread->wait); wake_count++; } } binder_inner_proc_unlock(proc); binder_debug(BINDER_DEBUG_OPEN_CLOSE, "binder_flush: %d woke %d threads\n", proc->pid, wake_count); } static int binder_release(struct inode *nodp, struct file *filp) { struct binder_proc *proc = filp->private_data; debugfs_remove(proc->debugfs_entry); if (proc->binderfs_entry) { binderfs_remove_file(proc->binderfs_entry); proc->binderfs_entry = NULL; } binder_defer_work(proc, BINDER_DEFERRED_RELEASE); return 0; } static int binder_node_release(struct binder_node *node, int refs) { struct binder_ref *ref; int death = 0; struct binder_proc *proc = node->proc; binder_release_work(proc, &node->async_todo); binder_node_lock(node); binder_inner_proc_lock(proc); binder_dequeue_work_ilocked(&node->work); /* * The caller must have taken a temporary ref on the node, */ BUG_ON(!node->tmp_refs); if (hlist_empty(&node->refs) && node->tmp_refs == 1) { binder_inner_proc_unlock(proc); binder_node_unlock(node); binder_free_node(node); return refs; } node->proc = NULL; node->local_strong_refs = 0; node->local_weak_refs = 0; binder_inner_proc_unlock(proc); spin_lock(&binder_dead_nodes_lock); hlist_add_head(&node->dead_node, &binder_dead_nodes); spin_unlock(&binder_dead_nodes_lock); hlist_for_each_entry(ref, &node->refs, node_entry) { refs++; /* * Need the node lock to synchronize * with new notification requests and the * inner lock to synchronize with queued * death notifications. */ binder_inner_proc_lock(ref->proc); if (!ref->death) { binder_inner_proc_unlock(ref->proc); continue; } death++; BUG_ON(!list_empty(&ref->death->work.entry)); ref->death->work.type = BINDER_WORK_DEAD_BINDER; binder_enqueue_work_ilocked(&ref->death->work, &ref->proc->todo); binder_wakeup_proc_ilocked(ref->proc); binder_inner_proc_unlock(ref->proc); } binder_debug(BINDER_DEBUG_DEAD_BINDER, "node %d now dead, refs %d, death %d\n", node->debug_id, refs, death); binder_node_unlock(node); binder_put_node(node); return refs; } static void binder_deferred_release(struct binder_proc *proc) { struct binder_context *context = proc->context; struct rb_node *n; int threads, nodes, incoming_refs, outgoing_refs, active_transactions; mutex_lock(&binder_procs_lock); hlist_del(&proc->proc_node); mutex_unlock(&binder_procs_lock); mutex_lock(&context->context_mgr_node_lock); if (context->binder_context_mgr_node && context->binder_context_mgr_node->proc == proc) { binder_debug(BINDER_DEBUG_DEAD_BINDER, "%s: %d context_mgr_node gone\n", __func__, proc->pid); context->binder_context_mgr_node = NULL; } mutex_unlock(&context->context_mgr_node_lock); binder_inner_proc_lock(proc); /* * Make sure proc stays alive after we * remove all the threads */ proc->tmp_ref++; proc->is_dead = true; proc->is_frozen = false; proc->sync_recv = false; proc->async_recv = false; threads = 0; active_transactions = 0; while ((n = rb_first(&proc->threads))) { struct binder_thread *thread; thread = rb_entry(n, struct binder_thread, rb_node); binder_inner_proc_unlock(proc); threads++; active_transactions += binder_thread_release(proc, thread); binder_inner_proc_lock(proc); } nodes = 0; incoming_refs = 0; while ((n = rb_first(&proc->nodes))) { struct binder_node *node; node = rb_entry(n, struct binder_node, rb_node); nodes++; /* * take a temporary ref on the node before * calling binder_node_release() which will either * kfree() the node or call binder_put_node() */ binder_inc_node_tmpref_ilocked(node); rb_erase(&node->rb_node, &proc->nodes); binder_inner_proc_unlock(proc); incoming_refs = binder_node_release(node, incoming_refs); binder_inner_proc_lock(proc); } binder_inner_proc_unlock(proc); outgoing_refs = 0; binder_proc_lock(proc); while ((n = rb_first(&proc->refs_by_desc))) { struct binder_ref *ref; ref = rb_entry(n, struct binder_ref, rb_node_desc); outgoing_refs++; binder_cleanup_ref_olocked(ref); binder_proc_unlock(proc); binder_free_ref(ref); binder_proc_lock(proc); } binder_proc_unlock(proc); binder_release_work(proc, &proc->todo); binder_release_work(proc, &proc->delivered_death); binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d\n", __func__, proc->pid, threads, nodes, incoming_refs, outgoing_refs, active_transactions); binder_proc_dec_tmpref(proc); } static void binder_deferred_func(struct work_struct *work) { struct binder_proc *proc; int defer; do { mutex_lock(&binder_deferred_lock); if (!hlist_empty(&binder_deferred_list)) { proc = hlist_entry(binder_deferred_list.first, struct binder_proc, deferred_work_node); hlist_del_init(&proc->deferred_work_node); defer = proc->deferred_work; proc->deferred_work = 0; } else { proc = NULL; defer = 0; } mutex_unlock(&binder_deferred_lock); if (defer & BINDER_DEFERRED_FLUSH) binder_deferred_flush(proc); if (defer & BINDER_DEFERRED_RELEASE) binder_deferred_release(proc); /* frees proc */ } while (proc); } static DECLARE_WORK(binder_deferred_work, binder_deferred_func); static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer) { mutex_lock(&binder_deferred_lock); proc->deferred_work |= defer; if (hlist_unhashed(&proc->deferred_work_node)) { hlist_add_head(&proc->deferred_work_node, &binder_deferred_list); schedule_work(&binder_deferred_work); } mutex_unlock(&binder_deferred_lock); } static void print_binder_transaction_ilocked(struct seq_file *m, struct binder_proc *proc, const char *prefix, struct binder_transaction *t) { struct binder_proc *to_proc; struct binder_buffer *buffer = t->buffer; ktime_t current_time = ktime_get(); spin_lock(&t->lock); to_proc = t->to_proc; seq_printf(m, "%s %d: %pK from %d:%d to %d:%d code %x flags %x pri %ld r%d elapsed %lldms", prefix, t->debug_id, t, t->from_pid, t->from_tid, to_proc ? to_proc->pid : 0, t->to_thread ? t->to_thread->pid : 0, t->code, t->flags, t->priority, t->need_reply, ktime_ms_delta(current_time, t->start_time)); spin_unlock(&t->lock); if (proc != to_proc) { /* * Can only safely deref buffer if we are holding the * correct proc inner lock for this node */ seq_puts(m, "\n"); return; } if (buffer == NULL) { seq_puts(m, " buffer free\n"); return; } if (buffer->target_node) seq_printf(m, " node %d", buffer->target_node->debug_id); seq_printf(m, " size %zd:%zd offset %lx\n", buffer->data_size, buffer->offsets_size, proc->alloc.buffer - buffer->user_data); } static void print_binder_work_ilocked(struct seq_file *m, struct binder_proc *proc, const char *prefix, const char *transaction_prefix, struct binder_work *w) { struct binder_node *node; struct binder_transaction *t; switch (w->type) { case BINDER_WORK_TRANSACTION: t = container_of(w, struct binder_transaction, work); print_binder_transaction_ilocked( m, proc, transaction_prefix, t); break; case BINDER_WORK_RETURN_ERROR: { struct binder_error *e = container_of( w, struct binder_error, work); seq_printf(m, "%stransaction error: %u\n", prefix, e->cmd); } break; case BINDER_WORK_TRANSACTION_COMPLETE: seq_printf(m, "%stransaction complete\n", prefix); break; case BINDER_WORK_NODE: node = container_of(w, struct binder_node, work); seq_printf(m, "%snode work %d: u%016llx c%016llx\n", prefix, node->debug_id, (u64)node->ptr, (u64)node->cookie); break; case BINDER_WORK_DEAD_BINDER: seq_printf(m, "%shas dead binder\n", prefix); break; case BINDER_WORK_DEAD_BINDER_AND_CLEAR: seq_printf(m, "%shas cleared dead binder\n", prefix); break; case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: seq_printf(m, "%shas cleared death notification\n", prefix); break; default: seq_printf(m, "%sunknown work: type %d\n", prefix, w->type); break; } } static void print_binder_thread_ilocked(struct seq_file *m, struct binder_thread *thread, int print_always) { struct binder_transaction *t; struct binder_work *w; size_t start_pos = m->count; size_t header_pos; seq_printf(m, " thread %d: l %02x need_return %d tr %d\n", thread->pid, thread->looper, thread->looper_need_return, atomic_read(&thread->tmp_ref)); header_pos = m->count; t = thread->transaction_stack; while (t) { if (t->from == thread) { print_binder_transaction_ilocked(m, thread->proc, " outgoing transaction", t); t = t->from_parent; } else if (t->to_thread == thread) { print_binder_transaction_ilocked(m, thread->proc, " incoming transaction", t); t = t->to_parent; } else { print_binder_transaction_ilocked(m, thread->proc, " bad transaction", t); t = NULL; } } list_for_each_entry(w, &thread->todo, entry) { print_binder_work_ilocked(m, thread->proc, " ", " pending transaction", w); } if (!print_always && m->count == header_pos) m->count = start_pos; } static void print_binder_node_nilocked(struct seq_file *m, struct binder_node *node) { struct binder_ref *ref; struct binder_work *w; int count; count = hlist_count_nodes(&node->refs); seq_printf(m, " node %d: u%016llx c%016llx hs %d hw %d ls %d lw %d is %d iw %d tr %d", node->debug_id, (u64)node->ptr, (u64)node->cookie, node->has_strong_ref, node->has_weak_ref, node->local_strong_refs, node->local_weak_refs, node->internal_strong_refs, count, node->tmp_refs); if (count) { seq_puts(m, " proc"); hlist_for_each_entry(ref, &node->refs, node_entry) seq_printf(m, " %d", ref->proc->pid); } seq_puts(m, "\n"); if (node->proc) { list_for_each_entry(w, &node->async_todo, entry) print_binder_work_ilocked(m, node->proc, " ", " pending async transaction", w); } } static void print_binder_ref_olocked(struct seq_file *m, struct binder_ref *ref) { binder_node_lock(ref->node); seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %pK\n", ref->data.debug_id, ref->data.desc, ref->node->proc ? "" : "dead ", ref->node->debug_id, ref->data.strong, ref->data.weak, ref->death); binder_node_unlock(ref->node); } static void print_binder_proc(struct seq_file *m, struct binder_proc *proc, int print_all) { struct binder_work *w; struct rb_node *n; size_t start_pos = m->count; size_t header_pos; struct binder_node *last_node = NULL; seq_printf(m, "proc %d\n", proc->pid); seq_printf(m, "context %s\n", proc->context->name); header_pos = m->count; binder_inner_proc_lock(proc); for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) print_binder_thread_ilocked(m, rb_entry(n, struct binder_thread, rb_node), print_all); for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) { struct binder_node *node = rb_entry(n, struct binder_node, rb_node); if (!print_all && !node->has_async_transaction) continue; /* * take a temporary reference on the node so it * survives and isn't removed from the tree * while we print it. */ binder_inc_node_tmpref_ilocked(node); /* Need to drop inner lock to take node lock */ binder_inner_proc_unlock(proc); if (last_node) binder_put_node(last_node); binder_node_inner_lock(node); print_binder_node_nilocked(m, node); binder_node_inner_unlock(node); last_node = node; binder_inner_proc_lock(proc); } binder_inner_proc_unlock(proc); if (last_node) binder_put_node(last_node); if (print_all) { binder_proc_lock(proc); for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) print_binder_ref_olocked(m, rb_entry(n, struct binder_ref, rb_node_desc)); binder_proc_unlock(proc); } binder_alloc_print_allocated(m, &proc->alloc); binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->todo, entry) print_binder_work_ilocked(m, proc, " ", " pending transaction", w); list_for_each_entry(w, &proc->delivered_death, entry) { seq_puts(m, " has delivered dead binder\n"); break; } binder_inner_proc_unlock(proc); if (!print_all && m->count == header_pos) m->count = start_pos; } static const char * const binder_return_strings[] = { "BR_ERROR", "BR_OK", "BR_TRANSACTION", "BR_REPLY", "BR_ACQUIRE_RESULT", "BR_DEAD_REPLY", "BR_TRANSACTION_COMPLETE", "BR_INCREFS", "BR_ACQUIRE", "BR_RELEASE", "BR_DECREFS", "BR_ATTEMPT_ACQUIRE", "BR_NOOP", "BR_SPAWN_LOOPER", "BR_FINISHED", "BR_DEAD_BINDER", "BR_CLEAR_DEATH_NOTIFICATION_DONE", "BR_FAILED_REPLY", "BR_FROZEN_REPLY", "BR_ONEWAY_SPAM_SUSPECT", "BR_TRANSACTION_PENDING_FROZEN" }; static const char * const binder_command_strings[] = { "BC_TRANSACTION", "BC_REPLY", "BC_ACQUIRE_RESULT", "BC_FREE_BUFFER", "BC_INCREFS", "BC_ACQUIRE", "BC_RELEASE", "BC_DECREFS", "BC_INCREFS_DONE", "BC_ACQUIRE_DONE", "BC_ATTEMPT_ACQUIRE", "BC_REGISTER_LOOPER", "BC_ENTER_LOOPER", "BC_EXIT_LOOPER", "BC_REQUEST_DEATH_NOTIFICATION", "BC_CLEAR_DEATH_NOTIFICATION", "BC_DEAD_BINDER_DONE", "BC_TRANSACTION_SG", "BC_REPLY_SG", }; static const char * const binder_objstat_strings[] = { "proc", "thread", "node", "ref", "death", "transaction", "transaction_complete" }; static void print_binder_stats(struct seq_file *m, const char *prefix, struct binder_stats *stats) { int i; BUILD_BUG_ON(ARRAY_SIZE(stats->bc) != ARRAY_SIZE(binder_command_strings)); for (i = 0; i < ARRAY_SIZE(stats->bc); i++) { int temp = atomic_read(&stats->bc[i]); if (temp) seq_printf(m, "%s%s: %d\n", prefix, binder_command_strings[i], temp); } BUILD_BUG_ON(ARRAY_SIZE(stats->br) != ARRAY_SIZE(binder_return_strings)); for (i = 0; i < ARRAY_SIZE(stats->br); i++) { int temp = atomic_read(&stats->br[i]); if (temp) seq_printf(m, "%s%s: %d\n", prefix, binder_return_strings[i], temp); } BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != ARRAY_SIZE(binder_objstat_strings)); BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != ARRAY_SIZE(stats->obj_deleted)); for (i = 0; i < ARRAY_SIZE(stats->obj_created); i++) { int created = atomic_read(&stats->obj_created[i]); int deleted = atomic_read(&stats->obj_deleted[i]); if (created || deleted) seq_printf(m, "%s%s: active %d total %d\n", prefix, binder_objstat_strings[i], created - deleted, created); } } static void print_binder_proc_stats(struct seq_file *m, struct binder_proc *proc) { struct binder_work *w; struct binder_thread *thread; struct rb_node *n; int count, strong, weak, ready_threads; size_t free_async_space = binder_alloc_get_free_async_space(&proc->alloc); seq_printf(m, "proc %d\n", proc->pid); seq_printf(m, "context %s\n", proc->context->name); count = 0; ready_threads = 0; binder_inner_proc_lock(proc); for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) count++; list_for_each_entry(thread, &proc->waiting_threads, waiting_thread_node) ready_threads++; seq_printf(m, " threads: %d\n", count); seq_printf(m, " requested threads: %d+%d/%d\n" " ready threads %d\n" " free async space %zd\n", proc->requested_threads, proc->requested_threads_started, proc->max_threads, ready_threads, free_async_space); count = 0; for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) count++; binder_inner_proc_unlock(proc); seq_printf(m, " nodes: %d\n", count); count = 0; strong = 0; weak = 0; binder_proc_lock(proc); for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) { struct binder_ref *ref = rb_entry(n, struct binder_ref, rb_node_desc); count++; strong += ref->data.strong; weak += ref->data.weak; } binder_proc_unlock(proc); seq_printf(m, " refs: %d s %d w %d\n", count, strong, weak); count = binder_alloc_get_allocated_count(&proc->alloc); seq_printf(m, " buffers: %d\n", count); binder_alloc_print_pages(m, &proc->alloc); count = 0; binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->todo, entry) { if (w->type == BINDER_WORK_TRANSACTION) count++; } binder_inner_proc_unlock(proc); seq_printf(m, " pending transactions: %d\n", count); print_binder_stats(m, " ", &proc->stats); } static int state_show(struct seq_file *m, void *unused) { struct binder_proc *proc; struct binder_node *node; struct binder_node *last_node = NULL; seq_puts(m, "binder state:\n"); spin_lock(&binder_dead_nodes_lock); if (!hlist_empty(&binder_dead_nodes)) seq_puts(m, "dead nodes:\n"); hlist_for_each_entry(node, &binder_dead_nodes, dead_node) { /* * take a temporary reference on the node so it * survives and isn't removed from the list * while we print it. */ node->tmp_refs++; spin_unlock(&binder_dead_nodes_lock); if (last_node) binder_put_node(last_node); binder_node_lock(node); print_binder_node_nilocked(m, node); binder_node_unlock(node); last_node = node; spin_lock(&binder_dead_nodes_lock); } spin_unlock(&binder_dead_nodes_lock); if (last_node) binder_put_node(last_node); mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, 1); mutex_unlock(&binder_procs_lock); return 0; } static int stats_show(struct seq_file *m, void *unused) { struct binder_proc *proc; seq_puts(m, "binder stats:\n"); print_binder_stats(m, "", &binder_stats); mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc_stats(m, proc); mutex_unlock(&binder_procs_lock); return 0; } static int transactions_show(struct seq_file *m, void *unused) { struct binder_proc *proc; seq_puts(m, "binder transactions:\n"); mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, 0); mutex_unlock(&binder_procs_lock); return 0; } static int proc_show(struct seq_file *m, void *unused) { struct binder_proc *itr; int pid = (unsigned long)m->private; mutex_lock(&binder_procs_lock); hlist_for_each_entry(itr, &binder_procs, proc_node) { if (itr->pid == pid) { seq_puts(m, "binder proc state:\n"); print_binder_proc(m, itr, 1); } } mutex_unlock(&binder_procs_lock); return 0; } static void print_binder_transaction_log_entry(struct seq_file *m, struct binder_transaction_log_entry *e) { int debug_id = READ_ONCE(e->debug_id_done); /* * read barrier to guarantee debug_id_done read before * we print the log values */ smp_rmb(); seq_printf(m, "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d ret %d/%d l=%d", e->debug_id, (e->call_type == 2) ? "reply" : ((e->call_type == 1) ? "async" : "call "), e->from_proc, e->from_thread, e->to_proc, e->to_thread, e->context_name, e->to_node, e->target_handle, e->data_size, e->offsets_size, e->return_error, e->return_error_param, e->return_error_line); /* * read-barrier to guarantee read of debug_id_done after * done printing the fields of the entry */ smp_rmb(); seq_printf(m, debug_id && debug_id == READ_ONCE(e->debug_id_done) ? "\n" : " (incomplete)\n"); } static int transaction_log_show(struct seq_file *m, void *unused) { struct binder_transaction_log *log = m->private; unsigned int log_cur = atomic_read(&log->cur); unsigned int count; unsigned int cur; int i; count = log_cur + 1; cur = count < ARRAY_SIZE(log->entry) && !log->full ? 0 : count % ARRAY_SIZE(log->entry); if (count > ARRAY_SIZE(log->entry) || log->full) count = ARRAY_SIZE(log->entry); for (i = 0; i < count; i++) { unsigned int index = cur++ % ARRAY_SIZE(log->entry); print_binder_transaction_log_entry(m, &log->entry[index]); } return 0; } const struct file_operations binder_fops = { .owner = THIS_MODULE, .poll = binder_poll, .unlocked_ioctl = binder_ioctl, .compat_ioctl = compat_ptr_ioctl, .mmap = binder_mmap, .open = binder_open, .flush = binder_flush, .release = binder_release, }; DEFINE_SHOW_ATTRIBUTE(state); DEFINE_SHOW_ATTRIBUTE(stats); DEFINE_SHOW_ATTRIBUTE(transactions); DEFINE_SHOW_ATTRIBUTE(transaction_log); const struct binder_debugfs_entry binder_debugfs_entries[] = { { .name = "state", .mode = 0444, .fops = &state_fops, .data = NULL, }, { .name = "stats", .mode = 0444, .fops = &stats_fops, .data = NULL, }, { .name = "transactions", .mode = 0444, .fops = &transactions_fops, .data = NULL, }, { .name = "transaction_log", .mode = 0444, .fops = &transaction_log_fops, .data = &binder_transaction_log, }, { .name = "failed_transaction_log", .mode = 0444, .fops = &transaction_log_fops, .data = &binder_transaction_log_failed, }, {} /* terminator */ }; static int __init init_binder_device(const char *name) { int ret; struct binder_device *binder_device; binder_device = kzalloc(sizeof(*binder_device), GFP_KERNEL); if (!binder_device) return -ENOMEM; binder_device->miscdev.fops = &binder_fops; binder_device->miscdev.minor = MISC_DYNAMIC_MINOR; binder_device->miscdev.name = name; refcount_set(&binder_device->ref, 1); binder_device->context.binder_context_mgr_uid = INVALID_UID; binder_device->context.name = name; mutex_init(&binder_device->context.context_mgr_node_lock); ret = misc_register(&binder_device->miscdev); if (ret < 0) { kfree(binder_device); return ret; } hlist_add_head(&binder_device->hlist, &binder_devices); return ret; } static int __init binder_init(void) { int ret; char *device_name, *device_tmp; struct binder_device *device; struct hlist_node *tmp; char *device_names = NULL; const struct binder_debugfs_entry *db_entry; ret = binder_alloc_shrinker_init(); if (ret) return ret; atomic_set(&binder_transaction_log.cur, ~0U); atomic_set(&binder_transaction_log_failed.cur, ~0U); binder_debugfs_dir_entry_root = debugfs_create_dir("binder", NULL); binder_for_each_debugfs_entry(db_entry) debugfs_create_file(db_entry->name, db_entry->mode, binder_debugfs_dir_entry_root, db_entry->data, db_entry->fops); binder_debugfs_dir_entry_proc = debugfs_create_dir("proc", binder_debugfs_dir_entry_root); if (!IS_ENABLED(CONFIG_ANDROID_BINDERFS) && strcmp(binder_devices_param, "") != 0) { /* * Copy the module_parameter string, because we don't want to * tokenize it in-place. */ device_names = kstrdup(binder_devices_param, GFP_KERNEL); if (!device_names) { ret = -ENOMEM; goto err_alloc_device_names_failed; } device_tmp = device_names; while ((device_name = strsep(&device_tmp, ","))) { ret = init_binder_device(device_name); if (ret) goto err_init_binder_device_failed; } } ret = init_binderfs(); if (ret) goto err_init_binder_device_failed; return ret; err_init_binder_device_failed: hlist_for_each_entry_safe(device, tmp, &binder_devices, hlist) { misc_deregister(&device->miscdev); hlist_del(&device->hlist); kfree(device); } kfree(device_names); err_alloc_device_names_failed: debugfs_remove_recursive(binder_debugfs_dir_entry_root); binder_alloc_shrinker_exit(); return ret; } device_initcall(binder_init); #define CREATE_TRACE_POINTS #include "binder_trace.h" MODULE_LICENSE("GPL v2"); |
7 1 1 5 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | /* Kernel module to match connection tracking byte counter. * GPL (C) 2002 Martin Devera (devik@cdi.cz). */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/bitops.h> #include <linux/skbuff.h> #include <linux/math64.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_connbytes.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_acct.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: Number of packets/bytes per connection matching"); MODULE_ALIAS("ipt_connbytes"); MODULE_ALIAS("ip6t_connbytes"); static bool connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_connbytes_info *sinfo = par->matchinfo; const struct nf_conn *ct; enum ip_conntrack_info ctinfo; u_int64_t what = 0; /* initialize to make gcc happy */ u_int64_t bytes = 0; u_int64_t pkts = 0; const struct nf_conn_acct *acct; const struct nf_conn_counter *counters; ct = nf_ct_get(skb, &ctinfo); if (!ct) return false; acct = nf_conn_acct_find(ct); if (!acct) return false; counters = acct->counter; switch (sinfo->what) { case XT_CONNBYTES_PKTS: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); break; case XT_CONNBYTES_DIR_REPLY: what = atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; case XT_CONNBYTES_DIR_BOTH: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); what += atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; } break; case XT_CONNBYTES_BYTES: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); break; case XT_CONNBYTES_DIR_REPLY: what = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); break; case XT_CONNBYTES_DIR_BOTH: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); what += atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); break; } break; case XT_CONNBYTES_AVGPKT: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); break; case XT_CONNBYTES_DIR_REPLY: bytes = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); pkts = atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; case XT_CONNBYTES_DIR_BOTH: bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes) + atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets) + atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; } if (pkts != 0) what = div64_u64(bytes, pkts); break; } if (sinfo->count.to >= sinfo->count.from) return what <= sinfo->count.to && what >= sinfo->count.from; else /* inverted */ return what < sinfo->count.to || what > sinfo->count.from; } static int connbytes_mt_check(const struct xt_mtchk_param *par) { const struct xt_connbytes_info *sinfo = par->matchinfo; int ret; if (sinfo->what != XT_CONNBYTES_PKTS && sinfo->what != XT_CONNBYTES_BYTES && sinfo->what != XT_CONNBYTES_AVGPKT) return -EINVAL; if (sinfo->direction != XT_CONNBYTES_DIR_ORIGINAL && sinfo->direction != XT_CONNBYTES_DIR_REPLY && sinfo->direction != XT_CONNBYTES_DIR_BOTH) return -EINVAL; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); /* * This filter cannot function correctly unless connection tracking * accounting is enabled, so complain in the hope that someone notices. */ if (!nf_ct_acct_enabled(par->net)) { pr_warn("Forcing CT accounting to be enabled\n"); nf_ct_set_acct(par->net, true); } return ret; } static void connbytes_mt_destroy(const struct xt_mtdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static struct xt_match connbytes_mt_reg __read_mostly = { .name = "connbytes", .revision = 0, .family = NFPROTO_UNSPEC, .checkentry = connbytes_mt_check, .match = connbytes_mt, .destroy = connbytes_mt_destroy, .matchsize = sizeof(struct xt_connbytes_info), .me = THIS_MODULE, }; static int __init connbytes_mt_init(void) { return xt_register_match(&connbytes_mt_reg); } static void __exit connbytes_mt_exit(void) { xt_unregister_match(&connbytes_mt_reg); } module_init(connbytes_mt_init); module_exit(connbytes_mt_exit); |
100 100 91 88 85 6 97 87 87 105 105 95 96 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 | /* mpiutil.ac - Utility functions for MPI * Copyright (C) 1998, 1999 Free Software Foundation, Inc. * * This file is part of GnuPG. * * GnuPG is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * GnuPG is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ #include "mpi-internal.h" /* Constants allocated right away at startup. */ static MPI constants[MPI_NUMBER_OF_CONSTANTS]; /* Initialize the MPI subsystem. This is called early and allows to * do some initialization without taking care of threading issues. */ static int __init mpi_init(void) { int idx; unsigned long value; for (idx = 0; idx < MPI_NUMBER_OF_CONSTANTS; idx++) { switch (idx) { case MPI_C_ZERO: value = 0; break; case MPI_C_ONE: value = 1; break; case MPI_C_TWO: value = 2; break; case MPI_C_THREE: value = 3; break; case MPI_C_FOUR: value = 4; break; case MPI_C_EIGHT: value = 8; break; default: pr_err("MPI: invalid mpi_const selector %d\n", idx); return -EFAULT; } constants[idx] = mpi_alloc_set_ui(value); constants[idx]->flags = (16|32); } return 0; } postcore_initcall(mpi_init); /* Return a constant MPI descripbed by NO which is one of the * MPI_C_xxx macros. There is no need to copy this returned value; it * may be used directly. */ MPI mpi_const(enum gcry_mpi_constants no) { if ((int)no < 0 || no > MPI_NUMBER_OF_CONSTANTS) pr_err("MPI: invalid mpi_const selector %d\n", no); if (!constants[no]) pr_err("MPI: MPI subsystem not initialized\n"); return constants[no]; } EXPORT_SYMBOL_GPL(mpi_const); /**************** * Note: It was a bad idea to use the number of limbs to allocate * because on a alpha the limbs are large but we normally need * integers of n bits - So we should change this to bits (or bytes). * * But mpi_alloc is used in a lot of places :-) */ MPI mpi_alloc(unsigned nlimbs) { MPI a; a = kmalloc(sizeof *a, GFP_KERNEL); if (!a) return a; if (nlimbs) { a->d = mpi_alloc_limb_space(nlimbs); if (!a->d) { kfree(a); return NULL; } } else { a->d = NULL; } a->alloced = nlimbs; a->nlimbs = 0; a->sign = 0; a->flags = 0; a->nbits = 0; return a; } EXPORT_SYMBOL_GPL(mpi_alloc); mpi_ptr_t mpi_alloc_limb_space(unsigned nlimbs) { size_t len = nlimbs * sizeof(mpi_limb_t); if (!len) return NULL; return kmalloc(len, GFP_KERNEL); } void mpi_free_limb_space(mpi_ptr_t a) { if (!a) return; kfree_sensitive(a); } void mpi_assign_limb_space(MPI a, mpi_ptr_t ap, unsigned nlimbs) { mpi_free_limb_space(a->d); a->d = ap; a->alloced = nlimbs; } /**************** * Resize the array of A to NLIMBS. the additional space is cleared * (set to 0) [done by m_realloc()] */ int mpi_resize(MPI a, unsigned nlimbs) { void *p; if (nlimbs <= a->alloced) return 0; /* no need to do it */ if (a->d) { p = kcalloc(nlimbs, sizeof(mpi_limb_t), GFP_KERNEL); if (!p) return -ENOMEM; memcpy(p, a->d, a->alloced * sizeof(mpi_limb_t)); kfree_sensitive(a->d); a->d = p; } else { a->d = kcalloc(nlimbs, sizeof(mpi_limb_t), GFP_KERNEL); if (!a->d) return -ENOMEM; } a->alloced = nlimbs; return 0; } void mpi_clear(MPI a) { if (!a) return; a->nlimbs = 0; a->flags = 0; } EXPORT_SYMBOL_GPL(mpi_clear); void mpi_free(MPI a) { if (!a) return; if (a->flags & 4) kfree_sensitive(a->d); else mpi_free_limb_space(a->d); if (a->flags & ~7) pr_info("invalid flag value in mpi\n"); kfree(a); } EXPORT_SYMBOL_GPL(mpi_free); /**************** * Note: This copy function should not interpret the MPI * but copy it transparently. */ MPI mpi_copy(MPI a) { int i; MPI b; if (a) { b = mpi_alloc(a->nlimbs); b->nlimbs = a->nlimbs; b->sign = a->sign; b->flags = a->flags; b->flags &= ~(16|32); /* Reset the immutable and constant flags. */ for (i = 0; i < b->nlimbs; i++) b->d[i] = a->d[i]; } else b = NULL; return b; } /**************** * This function allocates an MPI which is optimized to hold * a value as large as the one given in the argument and allocates it * with the same flags as A. */ MPI mpi_alloc_like(MPI a) { MPI b; if (a) { b = mpi_alloc(a->nlimbs); b->nlimbs = 0; b->sign = 0; b->flags = a->flags; } else b = NULL; return b; } /* Set U into W and release U. If W is NULL only U will be released. */ void mpi_snatch(MPI w, MPI u) { if (w) { mpi_assign_limb_space(w, u->d, u->alloced); w->nlimbs = u->nlimbs; w->sign = u->sign; w->flags = u->flags; u->alloced = 0; u->nlimbs = 0; u->d = NULL; } mpi_free(u); } MPI mpi_set(MPI w, MPI u) { mpi_ptr_t wp, up; mpi_size_t usize = u->nlimbs; int usign = u->sign; if (!w) w = mpi_alloc(mpi_get_nlimbs(u)); RESIZE_IF_NEEDED(w, usize); wp = w->d; up = u->d; MPN_COPY(wp, up, usize); w->nlimbs = usize; w->flags = u->flags; w->flags &= ~(16|32); /* Reset the immutable and constant flags. */ w->sign = usign; return w; } EXPORT_SYMBOL_GPL(mpi_set); MPI mpi_set_ui(MPI w, unsigned long u) { if (!w) w = mpi_alloc(1); /* FIXME: If U is 0 we have no need to resize and thus possible * allocating the limbs. */ RESIZE_IF_NEEDED(w, 1); w->d[0] = u; w->nlimbs = u ? 1 : 0; w->sign = 0; w->flags = 0; return w; } EXPORT_SYMBOL_GPL(mpi_set_ui); MPI mpi_alloc_set_ui(unsigned long u) { MPI w = mpi_alloc(1); w->d[0] = u; w->nlimbs = u ? 1 : 0; w->sign = 0; return w; } /**************** * Swap the value of A and B, when SWAP is 1. * Leave the value when SWAP is 0. * This implementation should be constant-time regardless of SWAP. */ void mpi_swap_cond(MPI a, MPI b, unsigned long swap) { mpi_size_t i; mpi_size_t nlimbs; mpi_limb_t mask = ((mpi_limb_t)0) - swap; mpi_limb_t x; if (a->alloced > b->alloced) nlimbs = b->alloced; else nlimbs = a->alloced; if (a->nlimbs > nlimbs || b->nlimbs > nlimbs) return; for (i = 0; i < nlimbs; i++) { x = mask & (a->d[i] ^ b->d[i]); a->d[i] = a->d[i] ^ x; b->d[i] = b->d[i] ^ x; } x = mask & (a->nlimbs ^ b->nlimbs); a->nlimbs = a->nlimbs ^ x; b->nlimbs = b->nlimbs ^ x; x = mask & (a->sign ^ b->sign); a->sign = a->sign ^ x; b->sign = b->sign ^ x; } MODULE_DESCRIPTION("Multiprecision maths library"); MODULE_LICENSE("GPL"); |
1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 | // SPDX-License-Identifier: GPL-2.0-or-later /* * DVB USB Linux driver for Afatech AF9015 DVB-T USB2.0 receiver * * Copyright (C) 2007 Antti Palosaari <crope@iki.fi> * * Thanks to Afatech who kindly provided information. */ #include "af9015.h" static int dvb_usb_af9015_remote; module_param_named(remote, dvb_usb_af9015_remote, int, 0644); MODULE_PARM_DESC(remote, "select remote"); DVB_DEFINE_MOD_OPT_ADAPTER_NR(adapter_nr); static int af9015_ctrl_msg(struct dvb_usb_device *d, struct req_t *req) { #define REQ_HDR_LEN 8 /* send header size */ #define ACK_HDR_LEN 2 /* rece header size */ struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret, wlen, rlen; u8 write = 1; mutex_lock(&d->usb_mutex); state->buf[0] = req->cmd; state->buf[1] = state->seq++; state->buf[2] = req->i2c_addr << 1; state->buf[3] = req->addr >> 8; state->buf[4] = req->addr & 0xff; state->buf[5] = req->mbox; state->buf[6] = req->addr_len; state->buf[7] = req->data_len; switch (req->cmd) { case GET_CONFIG: case READ_MEMORY: case RECONNECT_USB: write = 0; break; case READ_I2C: write = 0; state->buf[2] |= 0x01; /* set I2C direction */ fallthrough; case WRITE_I2C: state->buf[0] = READ_WRITE_I2C; break; case WRITE_MEMORY: if (((req->addr & 0xff00) == 0xff00) || ((req->addr & 0xff00) == 0xae00)) state->buf[0] = WRITE_VIRTUAL_MEMORY; break; case WRITE_VIRTUAL_MEMORY: case COPY_FIRMWARE: case DOWNLOAD_FIRMWARE: case BOOT: break; default: dev_err(&intf->dev, "unknown cmd %d\n", req->cmd); ret = -EIO; goto error; } /* Buffer overflow check */ if ((write && (req->data_len > BUF_LEN - REQ_HDR_LEN)) || (!write && (req->data_len > BUF_LEN - ACK_HDR_LEN))) { dev_err(&intf->dev, "too much data, cmd %u, len %u\n", req->cmd, req->data_len); ret = -EINVAL; goto error; } /* * Write receives seq + status = 2 bytes * Read receives seq + status + data = 2 + N bytes */ wlen = REQ_HDR_LEN; rlen = ACK_HDR_LEN; if (write) { wlen += req->data_len; memcpy(&state->buf[REQ_HDR_LEN], req->data, req->data_len); } else { rlen += req->data_len; } /* no ack for these packets */ if (req->cmd == DOWNLOAD_FIRMWARE || req->cmd == RECONNECT_USB) rlen = 0; ret = dvb_usbv2_generic_rw_locked(d, state->buf, wlen, state->buf, rlen); if (ret) goto error; /* check status */ if (rlen && state->buf[1]) { dev_err(&intf->dev, "cmd failed %u\n", state->buf[1]); ret = -EIO; goto error; } /* read request, copy returned data to return buf */ if (!write) memcpy(req->data, &state->buf[ACK_HDR_LEN], req->data_len); error: mutex_unlock(&d->usb_mutex); return ret; } static int af9015_write_reg_i2c(struct dvb_usb_device *d, u8 addr, u16 reg, u8 val) { struct af9015_state *state = d_to_priv(d); struct req_t req = {WRITE_I2C, addr, reg, 1, 1, 1, &val}; if (addr == state->af9013_i2c_addr[0] || addr == state->af9013_i2c_addr[1]) req.addr_len = 3; return af9015_ctrl_msg(d, &req); } static int af9015_read_reg_i2c(struct dvb_usb_device *d, u8 addr, u16 reg, u8 *val) { struct af9015_state *state = d_to_priv(d); struct req_t req = {READ_I2C, addr, reg, 0, 1, 1, val}; if (addr == state->af9013_i2c_addr[0] || addr == state->af9013_i2c_addr[1]) req.addr_len = 3; return af9015_ctrl_msg(d, &req); } static int af9015_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg msg[], int num) { struct dvb_usb_device *d = i2c_get_adapdata(adap); struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret; u16 addr; u8 mbox, addr_len; struct req_t req; /* * I2C multiplexing: * There could be two tuners, both using same I2C address. Demodulator * I2C-gate is only possibility to select correct tuner. * * ........................................... * . AF9015 integrates AF9013 demodulator . * . ____________ ____________ . ____________ * .| USB IF | | demod |. | tuner | * .|------------| |------------|. |------------| * .| AF9015 | | AF9013 |. | MXL5003 | * .| |--+--I2C-----|-----/ -----|.----I2C-----| | * .| | | | addr 0x1c |. | addr 0x63 | * .|____________| | |____________|. |____________| * .................|......................... * | ____________ ____________ * | | demod | | tuner | * | |------------| |------------| * | | AF9013 | | MXL5003 | * +--I2C-----|-----/ -----|-----I2C-----| | * | addr 0x1d | | addr 0x63 | * |____________| |____________| */ if (msg[0].len == 0 || msg[0].flags & I2C_M_RD) { addr = 0x0000; mbox = 0; addr_len = 0; } else if (msg[0].len == 1) { addr = msg[0].buf[0]; mbox = 0; addr_len = 1; } else if (msg[0].len == 2) { addr = msg[0].buf[0] << 8 | msg[0].buf[1] << 0; mbox = 0; addr_len = 2; } else { addr = msg[0].buf[0] << 8 | msg[0].buf[1] << 0; mbox = msg[0].buf[2]; addr_len = 3; } if (num == 1 && !(msg[0].flags & I2C_M_RD)) { /* i2c write */ if (msg[0].len > 21) { ret = -EOPNOTSUPP; goto err; } if (msg[0].addr == state->af9013_i2c_addr[0]) req.cmd = WRITE_MEMORY; else req.cmd = WRITE_I2C; req.i2c_addr = msg[0].addr; req.addr = addr; req.mbox = mbox; req.addr_len = addr_len; req.data_len = msg[0].len - addr_len; req.data = &msg[0].buf[addr_len]; ret = af9015_ctrl_msg(d, &req); } else if (num == 2 && !(msg[0].flags & I2C_M_RD) && (msg[1].flags & I2C_M_RD)) { /* i2c write + read */ if (msg[0].len > 3 || msg[1].len > 61) { ret = -EOPNOTSUPP; goto err; } if (msg[0].addr == state->af9013_i2c_addr[0]) req.cmd = READ_MEMORY; else req.cmd = READ_I2C; req.i2c_addr = msg[0].addr; req.addr = addr; req.mbox = mbox; req.addr_len = addr_len; req.data_len = msg[1].len; req.data = &msg[1].buf[0]; ret = af9015_ctrl_msg(d, &req); } else if (num == 1 && (msg[0].flags & I2C_M_RD)) { /* i2c read */ if (msg[0].len > 61) { ret = -EOPNOTSUPP; goto err; } if (msg[0].addr == state->af9013_i2c_addr[0]) { ret = -EINVAL; goto err; } req.cmd = READ_I2C; req.i2c_addr = msg[0].addr; req.addr = addr; req.mbox = mbox; req.addr_len = addr_len; req.data_len = msg[0].len; req.data = &msg[0].buf[0]; ret = af9015_ctrl_msg(d, &req); } else { ret = -EOPNOTSUPP; dev_dbg(&intf->dev, "unknown msg, num %u\n", num); } if (ret) goto err; return num; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static u32 af9015_i2c_func(struct i2c_adapter *adapter) { return I2C_FUNC_I2C; } static struct i2c_algorithm af9015_i2c_algo = { .master_xfer = af9015_i2c_xfer, .functionality = af9015_i2c_func, }; static int af9015_identify_state(struct dvb_usb_device *d, const char **name) { struct usb_interface *intf = d->intf; int ret; u8 reply; struct req_t req = {GET_CONFIG, 0, 0, 0, 0, 1, &reply}; ret = af9015_ctrl_msg(d, &req); if (ret) return ret; dev_dbg(&intf->dev, "reply %02x\n", reply); if (reply == 0x02) ret = WARM; else ret = COLD; return ret; } static int af9015_download_firmware(struct dvb_usb_device *d, const struct firmware *firmware) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret, i, rem; struct req_t req = {DOWNLOAD_FIRMWARE, 0, 0, 0, 0, 0, NULL}; u16 checksum; dev_dbg(&intf->dev, "\n"); /* Calc checksum, we need it when copy firmware to slave demod */ for (i = 0, checksum = 0; i < firmware->size; i++) checksum += firmware->data[i]; state->firmware_size = firmware->size; state->firmware_checksum = checksum; #define LEN_MAX (BUF_LEN - REQ_HDR_LEN) /* Max payload size */ for (rem = firmware->size; rem > 0; rem -= LEN_MAX) { req.data_len = min(LEN_MAX, rem); req.data = (u8 *)&firmware->data[firmware->size - rem]; req.addr = 0x5100 + firmware->size - rem; ret = af9015_ctrl_msg(d, &req); if (ret) { dev_err(&intf->dev, "firmware download failed %d\n", ret); goto err; } } req.cmd = BOOT; req.data_len = 0; ret = af9015_ctrl_msg(d, &req); if (ret) { dev_err(&intf->dev, "firmware boot failed %d\n", ret); goto err; } return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } #define AF9015_EEPROM_SIZE 256 /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ #define GOLDEN_RATIO_PRIME_32 0x9e370001UL /* hash (and dump) eeprom */ static int af9015_eeprom_hash(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret, i; u8 buf[AF9015_EEPROM_SIZE]; struct req_t req = {READ_I2C, AF9015_I2C_EEPROM, 0, 0, 1, 1, NULL}; /* read eeprom */ for (i = 0; i < AF9015_EEPROM_SIZE; i++) { req.addr = i; req.data = &buf[i]; ret = af9015_ctrl_msg(d, &req); if (ret < 0) goto err; } /* calculate checksum */ for (i = 0; i < AF9015_EEPROM_SIZE / sizeof(u32); i++) { state->eeprom_sum *= GOLDEN_RATIO_PRIME_32; state->eeprom_sum += le32_to_cpu(((__le32 *)buf)[i]); } for (i = 0; i < AF9015_EEPROM_SIZE; i += 16) dev_dbg(&intf->dev, "%*ph\n", 16, buf + i); dev_dbg(&intf->dev, "eeprom sum %.8x\n", state->eeprom_sum); return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_read_config(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret; u8 val, i, offset = 0; struct req_t req = {READ_I2C, AF9015_I2C_EEPROM, 0, 0, 1, 1, &val}; dev_dbg(&intf->dev, "\n"); /* IR remote controller */ req.addr = AF9015_EEPROM_IR_MODE; /* first message will timeout often due to possible hw bug */ for (i = 0; i < 4; i++) { ret = af9015_ctrl_msg(d, &req); if (!ret) break; } if (ret) goto error; ret = af9015_eeprom_hash(d); if (ret) goto error; state->ir_mode = val; dev_dbg(&intf->dev, "ir mode %02x\n", val); /* TS mode - one or two receivers */ req.addr = AF9015_EEPROM_TS_MODE; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->dual_mode = val; dev_dbg(&intf->dev, "ts mode %02x\n", state->dual_mode); state->af9013_i2c_addr[0] = AF9015_I2C_DEMOD; if (state->dual_mode) { /* read 2nd demodulator I2C address */ req.addr = AF9015_EEPROM_DEMOD2_I2C; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->af9013_i2c_addr[1] = val >> 1; } for (i = 0; i < state->dual_mode + 1; i++) { if (i == 1) offset = AF9015_EEPROM_OFFSET; /* xtal */ req.addr = AF9015_EEPROM_XTAL_TYPE1 + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; switch (val) { case 0: state->af9013_pdata[i].clk = 28800000; break; case 1: state->af9013_pdata[i].clk = 20480000; break; case 2: state->af9013_pdata[i].clk = 28000000; break; case 3: state->af9013_pdata[i].clk = 25000000; break; } dev_dbg(&intf->dev, "[%d] xtal %02x, clk %u\n", i, val, state->af9013_pdata[i].clk); /* IF frequency */ req.addr = AF9015_EEPROM_IF1H + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->af9013_pdata[i].if_frequency = val << 8; req.addr = AF9015_EEPROM_IF1L + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->af9013_pdata[i].if_frequency += val; state->af9013_pdata[i].if_frequency *= 1000; dev_dbg(&intf->dev, "[%d] if frequency %u\n", i, state->af9013_pdata[i].if_frequency); /* MT2060 IF1 */ req.addr = AF9015_EEPROM_MT2060_IF1H + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->mt2060_if1[i] = val << 8; req.addr = AF9015_EEPROM_MT2060_IF1L + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->mt2060_if1[i] += val; dev_dbg(&intf->dev, "[%d] MT2060 IF1 %u\n", i, state->mt2060_if1[i]); /* tuner */ req.addr = AF9015_EEPROM_TUNER_ID1 + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; switch (val) { case AF9013_TUNER_ENV77H11D5: case AF9013_TUNER_MT2060: case AF9013_TUNER_QT1010: case AF9013_TUNER_UNKNOWN: case AF9013_TUNER_MT2060_2: case AF9013_TUNER_TDA18271: case AF9013_TUNER_QT1010A: case AF9013_TUNER_TDA18218: state->af9013_pdata[i].spec_inv = 1; break; case AF9013_TUNER_MXL5003D: case AF9013_TUNER_MXL5005D: case AF9013_TUNER_MXL5005R: case AF9013_TUNER_MXL5007T: state->af9013_pdata[i].spec_inv = 0; break; case AF9013_TUNER_MC44S803: state->af9013_pdata[i].gpio[1] = AF9013_GPIO_LO; state->af9013_pdata[i].spec_inv = 1; break; default: dev_err(&intf->dev, "tuner id %02x not supported, please report!\n", val); return -ENODEV; } state->af9013_pdata[i].tuner = val; dev_dbg(&intf->dev, "[%d] tuner id %02x\n", i, val); } error: if (ret) dev_err(&intf->dev, "eeprom read failed %d\n", ret); /* * AverMedia AVerTV Volar Black HD (A850) device have bad EEPROM * content :-( Override some wrong values here. Ditto for the * AVerTV Red HD+ (A850T) device. */ if (le16_to_cpu(d->udev->descriptor.idVendor) == USB_VID_AVERMEDIA && ((le16_to_cpu(d->udev->descriptor.idProduct) == USB_PID_AVERMEDIA_A850) || (le16_to_cpu(d->udev->descriptor.idProduct) == USB_PID_AVERMEDIA_A850T))) { dev_dbg(&intf->dev, "AverMedia A850: overriding config\n"); /* disable dual mode */ state->dual_mode = 0; /* set correct IF */ state->af9013_pdata[0].if_frequency = 4570000; } return ret; } static int af9015_get_stream_config(struct dvb_frontend *fe, u8 *ts_type, struct usb_data_stream_properties *stream) { struct dvb_usb_device *d = fe_to_d(fe); struct usb_interface *intf = d->intf; dev_dbg(&intf->dev, "adap %u\n", fe_to_adap(fe)->id); if (d->udev->speed == USB_SPEED_FULL) stream->u.bulk.buffersize = 5 * 188; return 0; } static int af9015_streaming_ctrl(struct dvb_frontend *fe, int onoff) { struct dvb_usb_device *d = fe_to_d(fe); struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret; unsigned int utmp1, utmp2, reg1, reg2; u8 buf[2]; const unsigned int adap_id = fe_to_adap(fe)->id; dev_dbg(&intf->dev, "adap id %d, onoff %d\n", adap_id, onoff); if (!state->usb_ts_if_configured[adap_id]) { dev_dbg(&intf->dev, "set usb and ts interface\n"); /* USB IF stream settings */ utmp1 = (d->udev->speed == USB_SPEED_FULL ? 5 : 87) * 188 / 4; utmp2 = (d->udev->speed == USB_SPEED_FULL ? 64 : 512) / 4; buf[0] = (utmp1 >> 0) & 0xff; buf[1] = (utmp1 >> 8) & 0xff; if (adap_id == 0) { /* 1st USB IF (EP4) stream settings */ reg1 = 0xdd88; reg2 = 0xdd0c; } else { /* 2nd USB IF (EP5) stream settings */ reg1 = 0xdd8a; reg2 = 0xdd0d; } ret = regmap_bulk_write(state->regmap, reg1, buf, 2); if (ret) goto err; ret = regmap_write(state->regmap, reg2, utmp2); if (ret) goto err; /* TS IF settings */ if (state->dual_mode) { utmp1 = 0x01; utmp2 = 0x10; } else { utmp1 = 0x00; utmp2 = 0x00; } ret = regmap_update_bits(state->regmap, 0xd50b, 0x01, utmp1); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xd520, 0x10, utmp2); if (ret) goto err; state->usb_ts_if_configured[adap_id] = true; } if (adap_id == 0 && onoff) { /* Adapter 0 stream on. EP4: clear NAK, enable, clear reset */ ret = regmap_update_bits(state->regmap, 0xdd13, 0x20, 0x00); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd11, 0x20, 0x20); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xd507, 0x04, 0x00); if (ret) goto err; } else if (adap_id == 1 && onoff) { /* Adapter 1 stream on. EP5: clear NAK, enable, clear reset */ ret = regmap_update_bits(state->regmap, 0xdd13, 0x40, 0x00); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd11, 0x40, 0x40); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xd50b, 0x02, 0x00); if (ret) goto err; } else if (adap_id == 0 && !onoff) { /* Adapter 0 stream off. EP4: set reset, disable, set NAK */ ret = regmap_update_bits(state->regmap, 0xd507, 0x04, 0x04); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd11, 0x20, 0x00); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd13, 0x20, 0x20); if (ret) goto err; } else if (adap_id == 1 && !onoff) { /* Adapter 1 stream off. EP5: set reset, disable, set NAK */ ret = regmap_update_bits(state->regmap, 0xd50b, 0x02, 0x02); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd11, 0x40, 0x00); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd13, 0x40, 0x40); if (ret) goto err; } return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_get_adapter_count(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); return state->dual_mode + 1; } /* override demod callbacks for resource locking */ static int af9015_af9013_set_frontend(struct dvb_frontend *fe) { int ret; struct af9015_state *state = fe_to_priv(fe); if (mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->set_frontend[fe_to_adap(fe)->id](fe); mutex_unlock(&state->fe_mutex); return ret; } /* override demod callbacks for resource locking */ static int af9015_af9013_read_status(struct dvb_frontend *fe, enum fe_status *status) { int ret; struct af9015_state *state = fe_to_priv(fe); if (mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->read_status[fe_to_adap(fe)->id](fe, status); mutex_unlock(&state->fe_mutex); return ret; } /* override demod callbacks for resource locking */ static int af9015_af9013_init(struct dvb_frontend *fe) { int ret; struct af9015_state *state = fe_to_priv(fe); if (mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->init[fe_to_adap(fe)->id](fe); mutex_unlock(&state->fe_mutex); return ret; } /* override demod callbacks for resource locking */ static int af9015_af9013_sleep(struct dvb_frontend *fe) { int ret; struct af9015_state *state = fe_to_priv(fe); if (mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->sleep[fe_to_adap(fe)->id](fe); mutex_unlock(&state->fe_mutex); return ret; } /* override tuner callbacks for resource locking */ static int af9015_tuner_init(struct dvb_frontend *fe) { int ret; struct af9015_state *state = fe_to_priv(fe); if (mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->tuner_init[fe_to_adap(fe)->id](fe); mutex_unlock(&state->fe_mutex); return ret; } /* override tuner callbacks for resource locking */ static int af9015_tuner_sleep(struct dvb_frontend *fe) { int ret; struct af9015_state *state = fe_to_priv(fe); if (mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->tuner_sleep[fe_to_adap(fe)->id](fe); mutex_unlock(&state->fe_mutex); return ret; } static int af9015_copy_firmware(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret; unsigned long timeout; u8 val, firmware_info[4]; struct req_t req = {COPY_FIRMWARE, 0, 0x5100, 0, 0, 4, firmware_info}; dev_dbg(&intf->dev, "\n"); firmware_info[0] = (state->firmware_size >> 8) & 0xff; firmware_info[1] = (state->firmware_size >> 0) & 0xff; firmware_info[2] = (state->firmware_checksum >> 8) & 0xff; firmware_info[3] = (state->firmware_checksum >> 0) & 0xff; /* Check whether firmware is already running */ ret = af9015_read_reg_i2c(d, state->af9013_i2c_addr[1], 0x98be, &val); if (ret) goto err; dev_dbg(&intf->dev, "firmware status %02x\n", val); if (val == 0x0c) return 0; /* Set i2c clock to 625kHz to speed up firmware copy */ ret = regmap_write(state->regmap, 0xd416, 0x04); if (ret) goto err; /* Copy firmware from master demod to slave demod */ ret = af9015_ctrl_msg(d, &req); if (ret) { dev_err(&intf->dev, "firmware copy cmd failed %d\n", ret); goto err; } /* Set i2c clock to 125kHz */ ret = regmap_write(state->regmap, 0xd416, 0x14); if (ret) goto err; /* Boot firmware */ ret = af9015_write_reg_i2c(d, state->af9013_i2c_addr[1], 0xe205, 0x01); if (ret) goto err; /* Poll firmware ready */ for (val = 0x00, timeout = jiffies + msecs_to_jiffies(1000); !time_after(jiffies, timeout) && val != 0x0c && val != 0x04;) { msleep(20); /* Check firmware status. 0c=OK, 04=fail */ ret = af9015_read_reg_i2c(d, state->af9013_i2c_addr[1], 0x98be, &val); if (ret) goto err; dev_dbg(&intf->dev, "firmware status %02x\n", val); } dev_dbg(&intf->dev, "firmware boot took %u ms\n", jiffies_to_msecs(jiffies) - (jiffies_to_msecs(timeout) - 1000)); if (val == 0x04) { ret = -ENODEV; dev_err(&intf->dev, "firmware did not run\n"); goto err; } else if (val != 0x0c) { ret = -ETIMEDOUT; dev_err(&intf->dev, "firmware boot timeout\n"); goto err; } return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_af9013_frontend_attach(struct dvb_usb_adapter *adap) { struct af9015_state *state = adap_to_priv(adap); struct dvb_usb_device *d = adap_to_d(adap); struct usb_interface *intf = d->intf; struct i2c_client *client; int ret; dev_dbg(&intf->dev, "adap id %u\n", adap->id); if (adap->id == 0) { state->af9013_pdata[0].ts_mode = AF9013_TS_MODE_USB; memcpy(state->af9013_pdata[0].api_version, "\x0\x1\x9\x0", 4); state->af9013_pdata[0].gpio[0] = AF9013_GPIO_HI; state->af9013_pdata[0].gpio[3] = AF9013_GPIO_TUNER_ON; } else if (adap->id == 1) { state->af9013_pdata[1].ts_mode = AF9013_TS_MODE_SERIAL; state->af9013_pdata[1].ts_output_pin = 7; memcpy(state->af9013_pdata[1].api_version, "\x0\x1\x9\x0", 4); state->af9013_pdata[1].gpio[0] = AF9013_GPIO_TUNER_ON; state->af9013_pdata[1].gpio[1] = AF9013_GPIO_LO; /* copy firmware to 2nd demodulator */ if (state->dual_mode) { /* Wait 2nd demodulator ready */ msleep(100); ret = af9015_copy_firmware(adap_to_d(adap)); if (ret) { dev_err(&intf->dev, "firmware copy to 2nd frontend failed, will disable it\n"); state->dual_mode = 0; goto err; } } else { ret = -ENODEV; goto err; } } /* Add I2C demod */ client = dvb_module_probe("af9013", NULL, &d->i2c_adap, state->af9013_i2c_addr[adap->id], &state->af9013_pdata[adap->id]); if (!client) { ret = -ENODEV; goto err; } adap->fe[0] = state->af9013_pdata[adap->id].get_dvb_frontend(client); state->demod_i2c_client[adap->id] = client; /* * AF9015 firmware does not like if it gets interrupted by I2C adapter * request on some critical phases. During normal operation I2C adapter * is used only 2nd demodulator and tuner on dual tuner devices. * Override demodulator callbacks and use mutex for limit access to * those "critical" paths to keep AF9015 happy. */ if (adap->fe[0]) { state->set_frontend[adap->id] = adap->fe[0]->ops.set_frontend; adap->fe[0]->ops.set_frontend = af9015_af9013_set_frontend; state->read_status[adap->id] = adap->fe[0]->ops.read_status; adap->fe[0]->ops.read_status = af9015_af9013_read_status; state->init[adap->id] = adap->fe[0]->ops.init; adap->fe[0]->ops.init = af9015_af9013_init; state->sleep[adap->id] = adap->fe[0]->ops.sleep; adap->fe[0]->ops.sleep = af9015_af9013_sleep; } return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_frontend_detach(struct dvb_usb_adapter *adap) { struct af9015_state *state = adap_to_priv(adap); struct dvb_usb_device *d = adap_to_d(adap); struct usb_interface *intf = d->intf; struct i2c_client *client; dev_dbg(&intf->dev, "adap id %u\n", adap->id); /* Remove I2C demod */ client = state->demod_i2c_client[adap->id]; dvb_module_release(client); return 0; } static struct mt2060_config af9015_mt2060_config = { .i2c_address = 0x60, .clock_out = 0, }; static struct qt1010_config af9015_qt1010_config = { .i2c_address = 0x62, }; static struct tda18271_config af9015_tda18271_config = { .gate = TDA18271_GATE_DIGITAL, .small_i2c = TDA18271_16_BYTE_CHUNK_INIT, }; static struct mxl5005s_config af9015_mxl5003_config = { .i2c_address = 0x63, .if_freq = IF_FREQ_4570000HZ, .xtal_freq = CRYSTAL_FREQ_16000000HZ, .agc_mode = MXL_SINGLE_AGC, .tracking_filter = MXL_TF_DEFAULT, .rssi_enable = MXL_RSSI_ENABLE, .cap_select = MXL_CAP_SEL_ENABLE, .div_out = MXL_DIV_OUT_4, .clock_out = MXL_CLOCK_OUT_DISABLE, .output_load = MXL5005S_IF_OUTPUT_LOAD_200_OHM, .top = MXL5005S_TOP_25P2, .mod_mode = MXL_DIGITAL_MODE, .if_mode = MXL_ZERO_IF, .AgcMasterByte = 0x00, }; static struct mxl5005s_config af9015_mxl5005_config = { .i2c_address = 0x63, .if_freq = IF_FREQ_4570000HZ, .xtal_freq = CRYSTAL_FREQ_16000000HZ, .agc_mode = MXL_SINGLE_AGC, .tracking_filter = MXL_TF_OFF, .rssi_enable = MXL_RSSI_ENABLE, .cap_select = MXL_CAP_SEL_ENABLE, .div_out = MXL_DIV_OUT_4, .clock_out = MXL_CLOCK_OUT_DISABLE, .output_load = MXL5005S_IF_OUTPUT_LOAD_200_OHM, .top = MXL5005S_TOP_25P2, .mod_mode = MXL_DIGITAL_MODE, .if_mode = MXL_ZERO_IF, .AgcMasterByte = 0x00, }; static struct mc44s803_config af9015_mc44s803_config = { .i2c_address = 0x60, .dig_out = 1, }; static struct tda18218_config af9015_tda18218_config = { .i2c_address = 0x60, .i2c_wr_max = 21, /* max wr bytes AF9015 I2C adap can handle at once */ }; static struct mxl5007t_config af9015_mxl5007t_config = { .xtal_freq_hz = MxL_XTAL_24_MHZ, .if_freq_hz = MxL_IF_4_57_MHZ, }; static int af9015_tuner_attach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *d = adap_to_d(adap); struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; struct i2c_client *client; struct i2c_adapter *adapter; int ret; dev_dbg(&intf->dev, "adap id %u\n", adap->id); client = state->demod_i2c_client[adap->id]; adapter = state->af9013_pdata[adap->id].get_i2c_adapter(client); switch (state->af9013_pdata[adap->id].tuner) { case AF9013_TUNER_MT2060: case AF9013_TUNER_MT2060_2: ret = dvb_attach(mt2060_attach, adap->fe[0], adapter, &af9015_mt2060_config, state->mt2060_if1[adap->id]) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_QT1010: case AF9013_TUNER_QT1010A: ret = dvb_attach(qt1010_attach, adap->fe[0], adapter, &af9015_qt1010_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_TDA18271: ret = dvb_attach(tda18271_attach, adap->fe[0], 0x60, adapter, &af9015_tda18271_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_TDA18218: ret = dvb_attach(tda18218_attach, adap->fe[0], adapter, &af9015_tda18218_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_MXL5003D: ret = dvb_attach(mxl5005s_attach, adap->fe[0], adapter, &af9015_mxl5003_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_MXL5005D: case AF9013_TUNER_MXL5005R: ret = dvb_attach(mxl5005s_attach, adap->fe[0], adapter, &af9015_mxl5005_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_ENV77H11D5: ret = dvb_attach(dvb_pll_attach, adap->fe[0], 0x60, adapter, DVB_PLL_TDA665X) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_MC44S803: ret = dvb_attach(mc44s803_attach, adap->fe[0], adapter, &af9015_mc44s803_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_MXL5007T: ret = dvb_attach(mxl5007t_attach, adap->fe[0], adapter, 0x60, &af9015_mxl5007t_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_UNKNOWN: default: dev_err(&intf->dev, "unknown tuner, tuner id %02x\n", state->af9013_pdata[adap->id].tuner); ret = -ENODEV; } if (adap->fe[0]->ops.tuner_ops.init) { state->tuner_init[adap->id] = adap->fe[0]->ops.tuner_ops.init; adap->fe[0]->ops.tuner_ops.init = af9015_tuner_init; } if (adap->fe[0]->ops.tuner_ops.sleep) { state->tuner_sleep[adap->id] = adap->fe[0]->ops.tuner_ops.sleep; adap->fe[0]->ops.tuner_ops.sleep = af9015_tuner_sleep; } return ret; } static int af9015_pid_filter_ctrl(struct dvb_usb_adapter *adap, int onoff) { struct af9015_state *state = adap_to_priv(adap); struct af9013_platform_data *pdata = &state->af9013_pdata[adap->id]; int ret; mutex_lock(&state->fe_mutex); ret = pdata->pid_filter_ctrl(adap->fe[0], onoff); mutex_unlock(&state->fe_mutex); return ret; } static int af9015_pid_filter(struct dvb_usb_adapter *adap, int index, u16 pid, int onoff) { struct af9015_state *state = adap_to_priv(adap); struct af9013_platform_data *pdata = &state->af9013_pdata[adap->id]; int ret; mutex_lock(&state->fe_mutex); ret = pdata->pid_filter(adap->fe[0], index, pid, onoff); mutex_unlock(&state->fe_mutex); return ret; } static int af9015_init(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret; dev_dbg(&intf->dev, "\n"); mutex_init(&state->fe_mutex); /* init RC canary */ ret = regmap_write(state->regmap, 0x98e9, 0xff); if (ret) goto error; error: return ret; } #if IS_ENABLED(CONFIG_RC_CORE) struct af9015_rc_setup { unsigned int id; char *rc_codes; }; static char *af9015_rc_setup_match(unsigned int id, const struct af9015_rc_setup *table) { for (; table->rc_codes; table++) if (table->id == id) return table->rc_codes; return NULL; } static const struct af9015_rc_setup af9015_rc_setup_modparam[] = { { AF9015_REMOTE_A_LINK_DTU_M, RC_MAP_ALINK_DTU_M }, { AF9015_REMOTE_MSI_DIGIVOX_MINI_II_V3, RC_MAP_MSI_DIGIVOX_II }, { AF9015_REMOTE_MYGICTV_U718, RC_MAP_TOTAL_MEDIA_IN_HAND }, { AF9015_REMOTE_DIGITTRADE_DVB_T, RC_MAP_DIGITTRADE }, { AF9015_REMOTE_AVERMEDIA_KS, RC_MAP_AVERMEDIA_RM_KS }, { } }; static const struct af9015_rc_setup af9015_rc_setup_hashes[] = { { 0xb8feb708, RC_MAP_MSI_DIGIVOX_II }, { 0xa3703d00, RC_MAP_ALINK_DTU_M }, { 0x9b7dc64e, RC_MAP_TOTAL_MEDIA_IN_HAND }, /* MYGICTV U718 */ { 0x5d49e3db, RC_MAP_DIGITTRADE }, /* LC-Power LC-USB-DVBT */ { } }; static int af9015_rc_query(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret; u8 buf[17]; /* read registers needed to detect remote controller code */ ret = regmap_bulk_read(state->regmap, 0x98d9, buf, sizeof(buf)); if (ret) goto error; /* If any of these are non-zero, assume invalid data */ if (buf[1] || buf[2] || buf[3]) { dev_dbg(&intf->dev, "invalid data\n"); return 0; } /* Check for repeat of previous code */ if ((state->rc_repeat != buf[6] || buf[0]) && !memcmp(&buf[12], state->rc_last, 4)) { dev_dbg(&intf->dev, "key repeated\n"); rc_repeat(d->rc_dev); state->rc_repeat = buf[6]; return 0; } /* Only process key if canary killed */ if (buf[16] != 0xff && buf[0] != 0x01) { enum rc_proto proto; dev_dbg(&intf->dev, "key pressed %*ph\n", 4, buf + 12); /* Reset the canary */ ret = regmap_write(state->regmap, 0x98e9, 0xff); if (ret) goto error; /* Remember this key */ memcpy(state->rc_last, &buf[12], 4); if (buf[14] == (u8)~buf[15]) { if (buf[12] == (u8)~buf[13]) { /* NEC */ state->rc_keycode = RC_SCANCODE_NEC(buf[12], buf[14]); proto = RC_PROTO_NEC; } else { /* NEC extended*/ state->rc_keycode = RC_SCANCODE_NECX(buf[12] << 8 | buf[13], buf[14]); proto = RC_PROTO_NECX; } } else { /* 32 bit NEC */ state->rc_keycode = RC_SCANCODE_NEC32(buf[12] << 24 | buf[13] << 16 | buf[14] << 8 | buf[15]); proto = RC_PROTO_NEC32; } rc_keydown(d->rc_dev, proto, state->rc_keycode, 0); } else { dev_dbg(&intf->dev, "no key press\n"); /* Invalidate last keypress */ /* Not really needed, but helps with debug */ state->rc_last[2] = state->rc_last[3]; } state->rc_repeat = buf[6]; state->rc_failed = false; error: if (ret) { dev_warn(&intf->dev, "rc query failed %d\n", ret); /* allow random errors as dvb-usb will stop polling on error */ if (!state->rc_failed) ret = 0; state->rc_failed = true; } return ret; } static int af9015_get_rc_config(struct dvb_usb_device *d, struct dvb_usb_rc *rc) { struct af9015_state *state = d_to_priv(d); u16 vid = le16_to_cpu(d->udev->descriptor.idVendor); if (state->ir_mode == AF9015_IR_MODE_DISABLED) return 0; /* try to load remote based module param */ if (!rc->map_name) rc->map_name = af9015_rc_setup_match(dvb_usb_af9015_remote, af9015_rc_setup_modparam); /* try to load remote based eeprom hash */ if (!rc->map_name) rc->map_name = af9015_rc_setup_match(state->eeprom_sum, af9015_rc_setup_hashes); /* try to load remote based USB iManufacturer string */ if (!rc->map_name && vid == USB_VID_AFATECH) { /* * Check USB manufacturer and product strings and try * to determine correct remote in case of chip vendor * reference IDs are used. * DO NOT ADD ANYTHING NEW HERE. Use hashes instead. */ char manufacturer[10]; memset(manufacturer, 0, sizeof(manufacturer)); usb_string(d->udev, d->udev->descriptor.iManufacturer, manufacturer, sizeof(manufacturer)); if (!strcmp("MSI", manufacturer)) { /* * iManufacturer 1 MSI * iProduct 2 MSI K-VOX */ rc->map_name = af9015_rc_setup_match(AF9015_REMOTE_MSI_DIGIVOX_MINI_II_V3, af9015_rc_setup_modparam); } } /* load empty to enable rc */ if (!rc->map_name) rc->map_name = RC_MAP_EMPTY; rc->allowed_protos = RC_PROTO_BIT_NEC | RC_PROTO_BIT_NECX | RC_PROTO_BIT_NEC32; rc->query = af9015_rc_query; rc->interval = 500; return 0; } #else #define af9015_get_rc_config NULL #endif static int af9015_regmap_write(void *context, const void *data, size_t count) { struct dvb_usb_device *d = context; struct usb_interface *intf = d->intf; int ret; u16 reg = ((u8 *)data)[0] << 8 | ((u8 *)data)[1] << 0; u8 *val = &((u8 *)data)[2]; const unsigned int len = count - 2; struct req_t req = {WRITE_MEMORY, 0, reg, 0, 0, len, val}; ret = af9015_ctrl_msg(d, &req); if (ret) goto err; return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_regmap_read(void *context, const void *reg_buf, size_t reg_size, void *val_buf, size_t val_size) { struct dvb_usb_device *d = context; struct usb_interface *intf = d->intf; int ret; u16 reg = ((u8 *)reg_buf)[0] << 8 | ((u8 *)reg_buf)[1] << 0; u8 *val = &((u8 *)val_buf)[0]; const unsigned int len = val_size; struct req_t req = {READ_MEMORY, 0, reg, 0, 0, len, val}; ret = af9015_ctrl_msg(d, &req); if (ret) goto err; return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_probe(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; struct usb_device *udev = interface_to_usbdev(intf); int ret; char manufacturer[sizeof("ITE Technologies, Inc.")]; static const struct regmap_config regmap_config = { .reg_bits = 16, .val_bits = 8, }; static const struct regmap_bus regmap_bus = { .read = af9015_regmap_read, .write = af9015_regmap_write, }; dev_dbg(&intf->dev, "\n"); memset(manufacturer, 0, sizeof(manufacturer)); usb_string(udev, udev->descriptor.iManufacturer, manufacturer, sizeof(manufacturer)); /* * There is two devices having same ID but different chipset. One uses * AF9015 and the other IT9135 chipset. Only difference seen on lsusb * is iManufacturer string. * * idVendor 0x0ccd TerraTec Electronic GmbH * idProduct 0x0099 * bcdDevice 2.00 * iManufacturer 1 Afatech * iProduct 2 DVB-T 2 * * idVendor 0x0ccd TerraTec Electronic GmbH * idProduct 0x0099 * bcdDevice 2.00 * iManufacturer 1 ITE Technologies, Inc. * iProduct 2 DVB-T TV Stick */ if ((le16_to_cpu(udev->descriptor.idVendor) == USB_VID_TERRATEC) && (le16_to_cpu(udev->descriptor.idProduct) == 0x0099)) { if (!strcmp("ITE Technologies, Inc.", manufacturer)) { ret = -ENODEV; dev_dbg(&intf->dev, "rejecting device\n"); goto err; } } state->regmap = regmap_init(&intf->dev, ®map_bus, d, ®map_config); if (IS_ERR(state->regmap)) { ret = PTR_ERR(state->regmap); goto err; } return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static void af9015_disconnect(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; dev_dbg(&intf->dev, "\n"); regmap_exit(state->regmap); } /* * Interface 0 is used by DVB-T receiver and * interface 1 is for remote controller (HID) */ static const struct dvb_usb_device_properties af9015_props = { .driver_name = KBUILD_MODNAME, .owner = THIS_MODULE, .adapter_nr = adapter_nr, .size_of_priv = sizeof(struct af9015_state), .generic_bulk_ctrl_endpoint = 0x02, .generic_bulk_ctrl_endpoint_response = 0x81, .probe = af9015_probe, .disconnect = af9015_disconnect, .identify_state = af9015_identify_state, .firmware = AF9015_FIRMWARE, .download_firmware = af9015_download_firmware, .i2c_algo = &af9015_i2c_algo, .read_config = af9015_read_config, .frontend_attach = af9015_af9013_frontend_attach, .frontend_detach = af9015_frontend_detach, .tuner_attach = af9015_tuner_attach, .init = af9015_init, .get_rc_config = af9015_get_rc_config, .get_stream_config = af9015_get_stream_config, .streaming_ctrl = af9015_streaming_ctrl, .get_adapter_count = af9015_get_adapter_count, .adapter = { { .caps = DVB_USB_ADAP_HAS_PID_FILTER | DVB_USB_ADAP_PID_FILTER_CAN_BE_TURNED_OFF, .pid_filter_count = 32, .pid_filter = af9015_pid_filter, .pid_filter_ctrl = af9015_pid_filter_ctrl, .stream = DVB_USB_STREAM_BULK(0x84, 6, 87 * 188), }, { .caps = DVB_USB_ADAP_HAS_PID_FILTER | DVB_USB_ADAP_PID_FILTER_CAN_BE_TURNED_OFF, .pid_filter_count = 32, .pid_filter = af9015_pid_filter, .pid_filter_ctrl = af9015_pid_filter_ctrl, .stream = DVB_USB_STREAM_BULK(0x85, 6, 87 * 188), }, }, }; static const struct usb_device_id af9015_id_table[] = { { DVB_USB_DEVICE(USB_VID_AFATECH, USB_PID_AFATECH_AF9015_9015, &af9015_props, "Afatech AF9015 reference design", NULL) }, { DVB_USB_DEVICE(USB_VID_AFATECH, USB_PID_AFATECH_AF9015_9016, &af9015_props, "Afatech AF9015 reference design", NULL) }, { DVB_USB_DEVICE(USB_VID_LEADTEK, USB_PID_WINFAST_DTV_DONGLE_GOLD, &af9015_props, "Leadtek WinFast DTV Dongle Gold", RC_MAP_LEADTEK_Y04G0051) }, { DVB_USB_DEVICE(USB_VID_PINNACLE, USB_PID_PINNACLE_PCTV71E, &af9015_props, "Pinnacle PCTV 71e", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_399U, &af9015_props, "KWorld PlusTV Dual DVB-T Stick (DVB-T 399U)", NULL) }, { DVB_USB_DEVICE(USB_VID_VISIONPLUS, USB_PID_TINYTWIN, &af9015_props, "DigitalNow TinyTwin", RC_MAP_AZUREWAVE_AD_TU700) }, { DVB_USB_DEVICE(USB_VID_VISIONPLUS, USB_PID_AZUREWAVE_AD_TU700, &af9015_props, "TwinHan AzureWave AD-TU700(704J)", RC_MAP_AZUREWAVE_AD_TU700) }, { DVB_USB_DEVICE(USB_VID_TERRATEC, USB_PID_TERRATEC_CINERGY_T_USB_XE_REV2, &af9015_props, "TerraTec Cinergy T USB XE", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_PC160_2T, &af9015_props, "KWorld PlusTV Dual DVB-T PCI (DVB-T PC160-2T)", NULL) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_VOLAR_X, &af9015_props, "AVerMedia AVerTV DVB-T Volar X", RC_MAP_AVERMEDIA_M135A) }, { DVB_USB_DEVICE(USB_VID_XTENSIONS, USB_PID_XTENSIONS_XD_380, &af9015_props, "Xtensions XD-380", NULL) }, { DVB_USB_DEVICE(USB_VID_MSI_2, USB_PID_MSI_DIGIVOX_DUO, &af9015_props, "MSI DIGIVOX Duo", RC_MAP_MSI_DIGIVOX_III) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_VOLAR_X_2, &af9015_props, "Fujitsu-Siemens Slim Mobile USB DVB-T", NULL) }, { DVB_USB_DEVICE(USB_VID_TELESTAR, USB_PID_TELESTAR_STARSTICK_2, &af9015_props, "Telestar Starstick 2", NULL) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_A309, &af9015_props, "AVerMedia A309", NULL) }, { DVB_USB_DEVICE(USB_VID_MSI_2, USB_PID_MSI_DIGI_VOX_MINI_III, &af9015_props, "MSI Digi VOX mini III", RC_MAP_MSI_DIGIVOX_III) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_395U, &af9015_props, "KWorld USB DVB-T TV Stick II (VS-DVB-T 395U)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_395U_2, &af9015_props, "KWorld USB DVB-T TV Stick II (VS-DVB-T 395U)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_395U_3, &af9015_props, "KWorld USB DVB-T TV Stick II (VS-DVB-T 395U)", NULL) }, { DVB_USB_DEVICE(USB_VID_AFATECH, USB_PID_TREKSTOR_DVBT, &af9015_props, "TrekStor DVB-T USB Stick", RC_MAP_TREKSTOR) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_A850, &af9015_props, "AverMedia AVerTV Volar Black HD (A850)", NULL) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_A805, &af9015_props, "AverMedia AVerTV Volar GPS 805 (A805)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_CONCEPTRONIC_CTVDIGRCU, &af9015_props, "Conceptronic USB2.0 DVB-T CTVDIGRCU V3.0", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_MC810, &af9015_props, "KWorld Digital MC-810", NULL) }, { DVB_USB_DEVICE(USB_VID_KYE, USB_PID_GENIUS_TVGO_DVB_T03, &af9015_props, "Genius TVGo DVB-T03", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_399U_2, &af9015_props, "KWorld PlusTV Dual DVB-T Stick (DVB-T 399U)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_PC160_T, &af9015_props, "KWorld PlusTV DVB-T PCI Pro Card (DVB-T PC160-T)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_SVEON_STV20, &af9015_props, "Sveon STV20 Tuner USB DVB-T HDTV", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_TINYTWIN_2, &af9015_props, "DigitalNow TinyTwin v2", RC_MAP_DIGITALNOW_TINYTWIN) }, { DVB_USB_DEVICE(USB_VID_LEADTEK, USB_PID_WINFAST_DTV2000DS, &af9015_props, "Leadtek WinFast DTV2000DS", RC_MAP_LEADTEK_Y04G0051) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_UB383_T, &af9015_props, "KWorld USB DVB-T Stick Mobile (UB383-T)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_395U_4, &af9015_props, "KWorld USB DVB-T TV Stick II (VS-DVB-T 395U)", NULL) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_A815M, &af9015_props, "AverMedia AVerTV Volar M (A815Mac)", NULL) }, { DVB_USB_DEVICE(USB_VID_TERRATEC, USB_PID_TERRATEC_CINERGY_T_STICK_RC, &af9015_props, "TerraTec Cinergy T Stick RC", RC_MAP_TERRATEC_SLIM_2) }, /* XXX: that same ID [0ccd:0099] is used by af9035 driver too */ { DVB_USB_DEVICE(USB_VID_TERRATEC, USB_PID_TERRATEC_CINERGY_T_STICK_DUAL_RC, &af9015_props, "TerraTec Cinergy T Stick Dual RC", RC_MAP_TERRATEC_SLIM) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_A850T, &af9015_props, "AverMedia AVerTV Red HD+ (A850T)", NULL) }, { DVB_USB_DEVICE(USB_VID_GTEK, USB_PID_TINYTWIN_3, &af9015_props, "DigitalNow TinyTwin v3", RC_MAP_DIGITALNOW_TINYTWIN) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_SVEON_STV22, &af9015_props, "Sveon STV22 Dual USB DVB-T Tuner HDTV", RC_MAP_MSI_DIGIVOX_III) }, { } }; MODULE_DEVICE_TABLE(usb, af9015_id_table); /* usb specific object needed to register this driver with the usb subsystem */ static struct usb_driver af9015_usb_driver = { .name = KBUILD_MODNAME, .id_table = af9015_id_table, .probe = dvb_usbv2_probe, .disconnect = dvb_usbv2_disconnect, .suspend = dvb_usbv2_suspend, .resume = dvb_usbv2_resume, .reset_resume = dvb_usbv2_reset_resume, .no_dynamic_id = 1, .soft_unbind = 1, }; module_usb_driver(af9015_usb_driver); MODULE_AUTHOR("Antti Palosaari <crope@iki.fi>"); MODULE_DESCRIPTION("Afatech AF9015 driver"); MODULE_LICENSE("GPL"); MODULE_FIRMWARE(AF9015_FIRMWARE); |
4 4 3 4 4 7 9 1 22 2 2 1 3 10 2 10 2 9 3 8 4 7 1 7 1 2 2 4 6 2 9 2 7 2 2 8 3 3 5 5 4 1 6 1 4 5 1 4 1 5 5 5 6 6 5 5 7 5 2 5 2 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016, Amir Vadai <amir@vadai.me> * Copyright (c) 2016, Mellanox Technologies. All rights reserved. */ #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <net/geneve.h> #include <net/vxlan.h> #include <net/erspan.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/dst.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> #include <linux/tc_act/tc_tunnel_key.h> #include <net/tc_act/tc_tunnel_key.h> static struct tc_action_ops act_tunnel_key_ops; TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; int action; params = rcu_dereference_bh(t->params); tcf_lastuse_update(&t->tcf_tm); tcf_action_update_bstats(&t->common, skb); action = READ_ONCE(t->tcf_action); switch (params->tcft_action) { case TCA_TUNNEL_KEY_ACT_RELEASE: skb_dst_drop(skb); break; case TCA_TUNNEL_KEY_ACT_SET: skb_dst_drop(skb); skb_dst_set(skb, dst_clone(¶ms->tcft_enc_metadata->dst)); break; default: WARN_ONCE(1, "Bad tunnel_key action %d.\n", params->tcft_action); break; } return action; } static const struct nla_policy enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_OPTS_UNSPEC] = { .strict_start_type = TCA_TUNNEL_KEY_ENC_OPTS_VXLAN }, [TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED }, [TCA_TUNNEL_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED }, [TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED }, }; static const struct nla_policy geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, }; static const struct nla_policy vxlan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 }, }; static const struct nla_policy erspan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 }, [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 }, [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 }, [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, }; static int tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1]; int err, data_len, opt_len; u8 *data; err = nla_parse_nested_deprecated(tb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX, nla, geneve_opt_policy, extack); if (err < 0) return err; if (!tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] || !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] || !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]) { NL_SET_ERR_MSG(extack, "Missing tunnel key geneve option class, type or data"); return -EINVAL; } data = nla_data(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]); data_len = nla_len(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]); if (data_len < 4) { NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is less than 4 bytes long"); return -ERANGE; } if (data_len % 4) { NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is not a multiple of 4 bytes long"); return -ERANGE; } opt_len = sizeof(struct geneve_opt) + data_len; if (dst) { struct geneve_opt *opt = dst; WARN_ON(dst_len < opt_len); opt->opt_class = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS]); opt->type = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE]); opt->length = data_len / 4; /* length is in units of 4 bytes */ opt->r1 = 0; opt->r2 = 0; opt->r3 = 0; memcpy(opt + 1, data, data_len); } return opt_len; } static int tunnel_key_copy_vxlan_opt(const struct nlattr *nla, void *dst, int dst_len, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1]; int err; err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, nla, vxlan_opt_policy, extack); if (err < 0) return err; if (!tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]) { NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp"); return -EINVAL; } if (dst) { struct vxlan_metadata *md = dst; md->gbp = nla_get_u32(tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]); md->gbp &= VXLAN_GBP_MASK; } return sizeof(struct vxlan_metadata); } static int tunnel_key_copy_erspan_opt(const struct nlattr *nla, void *dst, int dst_len, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1]; int err; u8 ver; err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX, nla, erspan_opt_policy, extack); if (err < 0) return err; if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]) { NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver"); return -EINVAL; } ver = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]); if (ver == 1) { if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]) { NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index"); return -EINVAL; } } else if (ver == 2) { if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] || !tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]) { NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid"); return -EINVAL; } } else { NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect"); return -EINVAL; } if (dst) { struct erspan_metadata *md = dst; md->version = ver; if (ver == 1) { nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]; md->u.index = nla_get_be32(nla); } else { nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR]; md->u.md2.dir = nla_get_u8(nla); nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]; set_hwid(&md->u.md2, nla_get_u8(nla)); } } return sizeof(struct erspan_metadata); } static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, int dst_len, struct netlink_ext_ack *extack) { int err, rem, opt_len, len = nla_len(nla), opts_len = 0, type = 0; const struct nlattr *attr, *head = nla_data(nla); err = nla_validate_deprecated(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX, enc_opts_policy, extack); if (err) return err; nla_for_each_attr(attr, head, len, rem) { switch (nla_type(attr)) { case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: if (type && type != TUNNEL_GENEVE_OPT) { NL_SET_ERR_MSG(extack, "Duplicate type for geneve options"); return -EINVAL; } opt_len = tunnel_key_copy_geneve_opt(attr, dst, dst_len, extack); if (opt_len < 0) return opt_len; opts_len += opt_len; if (opts_len > IP_TUNNEL_OPTS_MAX) { NL_SET_ERR_MSG(extack, "Tunnel options exceeds max size"); return -EINVAL; } if (dst) { dst_len -= opt_len; dst += opt_len; } type = TUNNEL_GENEVE_OPT; break; case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: if (type) { NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options"); return -EINVAL; } opt_len = tunnel_key_copy_vxlan_opt(attr, dst, dst_len, extack); if (opt_len < 0) return opt_len; opts_len += opt_len; type = TUNNEL_VXLAN_OPT; break; case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN: if (type) { NL_SET_ERR_MSG(extack, "Duplicate type for erspan options"); return -EINVAL; } opt_len = tunnel_key_copy_erspan_opt(attr, dst, dst_len, extack); if (opt_len < 0) return opt_len; opts_len += opt_len; type = TUNNEL_ERSPAN_OPT; break; } } if (!opts_len) { NL_SET_ERR_MSG(extack, "Empty list of tunnel options"); return -EINVAL; } if (rem > 0) { NL_SET_ERR_MSG(extack, "Trailing data after parsing tunnel key options attributes"); return -EINVAL; } return opts_len; } static int tunnel_key_get_opts_len(struct nlattr *nla, struct netlink_ext_ack *extack) { return tunnel_key_copy_opts(nla, NULL, 0, extack); } static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info, int opts_len, struct netlink_ext_ack *extack) { info->options_len = opts_len; switch (nla_type(nla_data(nla))) { case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: #if IS_ENABLED(CONFIG_INET) info->key.tun_flags |= TUNNEL_GENEVE_OPT; return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), opts_len, extack); #else return -EAFNOSUPPORT; #endif case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: #if IS_ENABLED(CONFIG_INET) info->key.tun_flags |= TUNNEL_VXLAN_OPT; return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), opts_len, extack); #else return -EAFNOSUPPORT; #endif case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN: #if IS_ENABLED(CONFIG_INET) info->key.tun_flags |= TUNNEL_ERSPAN_OPT; return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), opts_len, extack); #else return -EAFNOSUPPORT; #endif default: NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type"); return -EINVAL; } } static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { [TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) }, [TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 }, [TCA_TUNNEL_KEY_ENC_IPV4_DST] = { .type = NLA_U32 }, [TCA_TUNNEL_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, [TCA_TUNNEL_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) }, [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, [TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16}, [TCA_TUNNEL_KEY_NO_CSUM] = { .type = NLA_U8 }, [TCA_TUNNEL_KEY_ENC_OPTS] = { .type = NLA_NESTED }, [TCA_TUNNEL_KEY_ENC_TOS] = { .type = NLA_U8 }, [TCA_TUNNEL_KEY_ENC_TTL] = { .type = NLA_U8 }, }; static void tunnel_key_release_params(struct tcf_tunnel_key_params *p) { if (!p) return; if (p->tcft_action == TCA_TUNNEL_KEY_ACT_SET) dst_release(&p->tcft_enc_metadata->dst); kfree_rcu(p, rcu); } static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_tunnel_key_ops.net_id); bool bind = act_flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; struct tcf_tunnel_key_params *params_new; struct metadata_dst *metadata = NULL; struct tcf_chain *goto_ch = NULL; struct tc_tunnel_key *parm; struct tcf_tunnel_key *t; bool exists = false; __be16 dst_port = 0; __be64 key_id = 0; int opts_len = 0; __be16 flags = 0; u8 tos, ttl; int ret = 0; u32 index; int err; if (!nla) { NL_SET_ERR_MSG(extack, "Tunnel requires attributes to be passed"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Failed to parse nested tunnel key attributes"); return err; } if (!tb[TCA_TUNNEL_KEY_PARMS]) { NL_SET_ERR_MSG(extack, "Missing tunnel key parameters"); return -EINVAL; } parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; switch (parm->t_action) { case TCA_TUNNEL_KEY_ACT_RELEASE: break; case TCA_TUNNEL_KEY_ACT_SET: if (tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) { __be32 key32; key32 = nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]); key_id = key32_to_tunnel_id(key32); flags = TUNNEL_KEY; } flags |= TUNNEL_CSUM; if (tb[TCA_TUNNEL_KEY_NO_CSUM] && nla_get_u8(tb[TCA_TUNNEL_KEY_NO_CSUM])) flags &= ~TUNNEL_CSUM; if (nla_get_flag(tb[TCA_TUNNEL_KEY_NO_FRAG])) flags |= TUNNEL_DONT_FRAGMENT; if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT]) dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]); if (tb[TCA_TUNNEL_KEY_ENC_OPTS]) { opts_len = tunnel_key_get_opts_len(tb[TCA_TUNNEL_KEY_ENC_OPTS], extack); if (opts_len < 0) { ret = opts_len; goto err_out; } } tos = 0; if (tb[TCA_TUNNEL_KEY_ENC_TOS]) tos = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TOS]); ttl = 0; if (tb[TCA_TUNNEL_KEY_ENC_TTL]) ttl = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TTL]); if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] && tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) { __be32 saddr; __be32 daddr; saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]); daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]); metadata = __ip_tun_set_dst(saddr, daddr, tos, ttl, dst_port, flags, key_id, opts_len); } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] && tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) { struct in6_addr saddr; struct in6_addr daddr; saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]); daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]); metadata = __ipv6_tun_set_dst(&saddr, &daddr, tos, ttl, dst_port, 0, flags, key_id, opts_len); } else { NL_SET_ERR_MSG(extack, "Missing either ipv4 or ipv6 src and dst"); ret = -EINVAL; goto err_out; } if (!metadata) { NL_SET_ERR_MSG(extack, "Cannot allocate tunnel metadata dst"); ret = -ENOMEM; goto err_out; } #ifdef CONFIG_DST_CACHE ret = dst_cache_init(&metadata->u.tun_info.dst_cache, GFP_KERNEL); if (ret) goto release_tun_meta; #endif if (opts_len) { ret = tunnel_key_opts_set(tb[TCA_TUNNEL_KEY_ENC_OPTS], &metadata->u.tun_info, opts_len, extack); if (ret < 0) goto release_tun_meta; } metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX; break; default: NL_SET_ERR_MSG(extack, "Unknown tunnel key action"); ret = -EINVAL; goto err_out; } if (!exists) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_tunnel_key_ops, bind, act_flags); if (ret) { NL_SET_ERR_MSG(extack, "Cannot create TC IDR"); goto release_tun_meta; } ret = ACT_P_CREATED; } else if (!(act_flags & TCA_ACT_FLAGS_REPLACE)) { NL_SET_ERR_MSG(extack, "TC IDR already exists"); ret = -EEXIST; goto release_tun_meta; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) { ret = err; exists = true; goto release_tun_meta; } t = to_tunnel_key(*a); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters"); ret = -ENOMEM; exists = true; goto put_chain; } params_new->tcft_action = parm->t_action; params_new->tcft_enc_metadata = metadata; spin_lock_bh(&t->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params_new = rcu_replace_pointer(t->params, params_new, lockdep_is_held(&t->tcf_lock)); spin_unlock_bh(&t->tcf_lock); tunnel_key_release_params(params_new); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_tun_meta: if (metadata) dst_release(&metadata->dst); err_out: if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return ret; } static void tunnel_key_release(struct tc_action *a) { struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; params = rcu_dereference_protected(t->params, 1); tunnel_key_release_params(params); } static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { int len = info->options_len; u8 *src = (u8 *)(info + 1); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE); if (!start) return -EMSGSIZE; while (len > 0) { struct geneve_opt *opt = (struct geneve_opt *)src; if (nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS, opt->opt_class) || nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE, opt->type) || nla_put(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA, opt->length * 4, opt + 1)) { nla_nest_cancel(skb, start); return -EMSGSIZE; } len -= sizeof(struct geneve_opt) + opt->length * 4; src += sizeof(struct geneve_opt) + opt->length * 4; } nla_nest_end(skb, start); return 0; } static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { struct vxlan_metadata *md = (struct vxlan_metadata *)(info + 1); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_VXLAN); if (!start) return -EMSGSIZE; if (nla_put_u32(skb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, md->gbp)) { nla_nest_cancel(skb, start); return -EMSGSIZE; } nla_nest_end(skb, start); return 0; } static int tunnel_key_erspan_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { struct erspan_metadata *md = (struct erspan_metadata *)(info + 1); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN); if (!start) return -EMSGSIZE; if (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER, md->version)) goto err; if (md->version == 1 && nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index)) goto err; if (md->version == 2 && (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR, md->u.md2.dir) || nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID, get_hwid(&md->u.md2)))) goto err; nla_nest_end(skb, start); return 0; err: nla_nest_cancel(skb, start); return -EMSGSIZE; } static int tunnel_key_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { struct nlattr *start; int err = -EINVAL; if (!info->options_len) return 0; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS); if (!start) return -EMSGSIZE; if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { err = tunnel_key_geneve_opts_dump(skb, info); if (err) goto err_out; } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { err = tunnel_key_vxlan_opts_dump(skb, info); if (err) goto err_out; } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) { err = tunnel_key_erspan_opts_dump(skb, info); if (err) goto err_out; } else { err_out: nla_nest_cancel(skb, start); return err; } nla_nest_end(skb, start); return 0; } static int tunnel_key_dump_addresses(struct sk_buff *skb, const struct ip_tunnel_info *info) { unsigned short family = ip_tunnel_info_af(info); if (family == AF_INET) { __be32 saddr = info->key.u.ipv4.src; __be32 daddr = info->key.u.ipv4.dst; if (!nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_SRC, saddr) && !nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_DST, daddr)) return 0; } if (family == AF_INET6) { const struct in6_addr *saddr6 = &info->key.u.ipv6.src; const struct in6_addr *daddr6 = &info->key.u.ipv6.dst; if (!nla_put_in6_addr(skb, TCA_TUNNEL_KEY_ENC_IPV6_SRC, saddr6) && !nla_put_in6_addr(skb, TCA_TUNNEL_KEY_ENC_IPV6_DST, daddr6)) return 0; } return -EINVAL; } static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; struct tc_tunnel_key opt = { .index = t->tcf_index, .refcnt = refcount_read(&t->tcf_refcnt) - ref, .bindcnt = atomic_read(&t->tcf_bindcnt) - bind, }; struct tcf_t tm; spin_lock_bh(&t->tcf_lock); params = rcu_dereference_protected(t->params, lockdep_is_held(&t->tcf_lock)); opt.action = t->tcf_action; opt.t_action = params->tcft_action; if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) { struct ip_tunnel_info *info = ¶ms->tcft_enc_metadata->u.tun_info; struct ip_tunnel_key *key = &info->key; __be32 key_id = tunnel_id_to_key32(key->tun_id); if (((key->tun_flags & TUNNEL_KEY) && nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id)) || tunnel_key_dump_addresses(skb, ¶ms->tcft_enc_metadata->u.tun_info) || (key->tp_dst && nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst)) || nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM, !(key->tun_flags & TUNNEL_CSUM)) || ((key->tun_flags & TUNNEL_DONT_FRAGMENT) && nla_put_flag(skb, TCA_TUNNEL_KEY_NO_FRAG)) || tunnel_key_opts_dump(skb, info)) goto nla_put_failure; if (key->tos && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TOS, key->tos)) goto nla_put_failure; if (key->ttl && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TTL, key->ttl)) goto nla_put_failure; } tcf_tm_dump(&tm, &t->tcf_tm); if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm), &tm, TCA_TUNNEL_KEY_PAD)) goto nla_put_failure; spin_unlock_bh(&t->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&t->tcf_lock); nlmsg_trim(skb, b); return -1; } static void tcf_tunnel_encap_put_tunnel(void *priv) { struct ip_tunnel_info *tunnel = priv; kfree(tunnel); } static int tcf_tunnel_encap_get_tunnel(struct flow_action_entry *entry, const struct tc_action *act) { entry->tunnel = tcf_tunnel_info_copy(act); if (!entry->tunnel) return -ENOMEM; entry->destructor = tcf_tunnel_encap_put_tunnel; entry->destructor_priv = entry->tunnel; return 0; } static int tcf_tunnel_key_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { int err; if (bind) { struct flow_action_entry *entry = entry_data; if (is_tcf_tunnel_set(act)) { entry->id = FLOW_ACTION_TUNNEL_ENCAP; err = tcf_tunnel_encap_get_tunnel(entry, act); if (err) return err; } else if (is_tcf_tunnel_release(act)) { entry->id = FLOW_ACTION_TUNNEL_DECAP; } else { NL_SET_ERR_MSG_MOD(extack, "Unsupported tunnel key mode offload"); return -EOPNOTSUPP; } *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; if (is_tcf_tunnel_set(act)) fl_action->id = FLOW_ACTION_TUNNEL_ENCAP; else if (is_tcf_tunnel_release(act)) fl_action->id = FLOW_ACTION_TUNNEL_DECAP; else return -EOPNOTSUPP; } return 0; } static struct tc_action_ops act_tunnel_key_ops = { .kind = "tunnel_key", .id = TCA_ID_TUNNEL_KEY, .owner = THIS_MODULE, .act = tunnel_key_act, .dump = tunnel_key_dump, .init = tunnel_key_init, .cleanup = tunnel_key_release, .offload_act_setup = tcf_tunnel_key_offload_act_setup, .size = sizeof(struct tcf_tunnel_key), }; MODULE_ALIAS_NET_ACT("tunnel_key"); static __net_init int tunnel_key_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_tunnel_key_ops.net_id); return tc_action_net_init(net, tn, &act_tunnel_key_ops); } static void __net_exit tunnel_key_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_tunnel_key_ops.net_id); } static struct pernet_operations tunnel_key_net_ops = { .init = tunnel_key_init_net, .exit_batch = tunnel_key_exit_net, .id = &act_tunnel_key_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init tunnel_key_init_module(void) { return tcf_register_action(&act_tunnel_key_ops, &tunnel_key_net_ops); } static void __exit tunnel_key_cleanup_module(void) { tcf_unregister_action(&act_tunnel_key_ops, &tunnel_key_net_ops); } module_init(tunnel_key_init_module); module_exit(tunnel_key_cleanup_module); MODULE_AUTHOR("Amir Vadai <amir@vadai.me>"); MODULE_DESCRIPTION("ip tunnel manipulation actions"); MODULE_LICENSE("GPL v2"); |
13 16 16 7 1 13 7 13 13 16 16 7 2 2 13 3 13 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 | // SPDX-License-Identifier: GPL-2.0-only /* * (C) 2007 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/gen_stats.h> #include <linux/jhash.h> #include <linux/rtnetlink.h> #include <linux/random.h> #include <linux/slab.h> #include <net/gen_stats.h> #include <net/netlink.h> #include <net/netns/generic.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_RATEEST.h> #include <net/netfilter/xt_rateest.h> #define RATEEST_HSIZE 16 struct xt_rateest_net { struct mutex hash_lock; struct hlist_head hash[RATEEST_HSIZE]; }; static unsigned int xt_rateest_id; static unsigned int jhash_rnd __read_mostly; static unsigned int xt_rateest_hash(const char *name) { return jhash(name, sizeof_field(struct xt_rateest, name), jhash_rnd) & (RATEEST_HSIZE - 1); } static void xt_rateest_hash_insert(struct xt_rateest_net *xn, struct xt_rateest *est) { unsigned int h; h = xt_rateest_hash(est->name); hlist_add_head(&est->list, &xn->hash[h]); } static struct xt_rateest *__xt_rateest_lookup(struct xt_rateest_net *xn, const char *name) { struct xt_rateest *est; unsigned int h; h = xt_rateest_hash(name); hlist_for_each_entry(est, &xn->hash[h], list) { if (strcmp(est->name, name) == 0) { est->refcnt++; return est; } } return NULL; } struct xt_rateest *xt_rateest_lookup(struct net *net, const char *name) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); struct xt_rateest *est; mutex_lock(&xn->hash_lock); est = __xt_rateest_lookup(xn, name); mutex_unlock(&xn->hash_lock); return est; } EXPORT_SYMBOL_GPL(xt_rateest_lookup); void xt_rateest_put(struct net *net, struct xt_rateest *est) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); mutex_lock(&xn->hash_lock); if (--est->refcnt == 0) { hlist_del(&est->list); gen_kill_estimator(&est->rate_est); /* * gen_estimator est_timer() might access est->lock or bstats, * wait a RCU grace period before freeing 'est' */ kfree_rcu(est, rcu); } mutex_unlock(&xn->hash_lock); } EXPORT_SYMBOL_GPL(xt_rateest_put); static unsigned int xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_rateest_target_info *info = par->targinfo; struct gnet_stats_basic_sync *stats = &info->est->bstats; spin_lock_bh(&info->est->lock); u64_stats_add(&stats->bytes, skb->len); u64_stats_inc(&stats->packets); spin_unlock_bh(&info->est->lock); return XT_CONTINUE; } static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) { struct xt_rateest_net *xn = net_generic(par->net, xt_rateest_id); struct xt_rateest_target_info *info = par->targinfo; struct xt_rateest *est; struct { struct nlattr opt; struct gnet_estimator est; } cfg; int ret; if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name)) return -ENAMETOOLONG; net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); mutex_lock(&xn->hash_lock); est = __xt_rateest_lookup(xn, info->name); if (est) { mutex_unlock(&xn->hash_lock); /* * If estimator parameters are specified, they must match the * existing estimator. */ if ((!info->interval && !info->ewma_log) || (info->interval != est->params.interval || info->ewma_log != est->params.ewma_log)) { xt_rateest_put(par->net, est); return -EINVAL; } info->est = est; return 0; } ret = -ENOMEM; est = kzalloc(sizeof(*est), GFP_KERNEL); if (!est) goto err1; gnet_stats_basic_sync_init(&est->bstats); strscpy(est->name, info->name, sizeof(est->name)); spin_lock_init(&est->lock); est->refcnt = 1; est->params.interval = info->interval; est->params.ewma_log = info->ewma_log; cfg.opt.nla_len = nla_attr_size(sizeof(cfg.est)); cfg.opt.nla_type = TCA_STATS_RATE_EST; cfg.est.interval = info->interval; cfg.est.ewma_log = info->ewma_log; ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est, &est->lock, NULL, &cfg.opt); if (ret < 0) goto err2; info->est = est; xt_rateest_hash_insert(xn, est); mutex_unlock(&xn->hash_lock); return 0; err2: kfree(est); err1: mutex_unlock(&xn->hash_lock); return ret; } static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) { struct xt_rateest_target_info *info = par->targinfo; xt_rateest_put(par->net, info->est); } static struct xt_target xt_rateest_tg_reg __read_mostly = { .name = "RATEEST", .revision = 0, .family = NFPROTO_UNSPEC, .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, .destroy = xt_rateest_tg_destroy, .targetsize = sizeof(struct xt_rateest_target_info), .usersize = offsetof(struct xt_rateest_target_info, est), .me = THIS_MODULE, }; static __net_init int xt_rateest_net_init(struct net *net) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); int i; mutex_init(&xn->hash_lock); for (i = 0; i < ARRAY_SIZE(xn->hash); i++) INIT_HLIST_HEAD(&xn->hash[i]); return 0; } static struct pernet_operations xt_rateest_net_ops = { .init = xt_rateest_net_init, .id = &xt_rateest_id, .size = sizeof(struct xt_rateest_net), }; static int __init xt_rateest_tg_init(void) { int err = register_pernet_subsys(&xt_rateest_net_ops); if (err) return err; return xt_register_target(&xt_rateest_tg_reg); } static void __exit xt_rateest_tg_fini(void) { xt_unregister_target(&xt_rateest_tg_reg); unregister_pernet_subsys(&xt_rateest_net_ops); } MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: packet rate estimator"); MODULE_ALIAS("ipt_RATEEST"); MODULE_ALIAS("ip6t_RATEEST"); module_init(xt_rateest_tg_init); module_exit(xt_rateest_tg_fini); |
47 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM vsock #if !defined(_TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H) || \ defined(TRACE_HEADER_MULTI_READ) #define _TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H #include <linux/tracepoint.h> TRACE_DEFINE_ENUM(VIRTIO_VSOCK_TYPE_STREAM); TRACE_DEFINE_ENUM(VIRTIO_VSOCK_TYPE_SEQPACKET); #define show_type(val) \ __print_symbolic(val, \ { VIRTIO_VSOCK_TYPE_STREAM, "STREAM" }, \ { VIRTIO_VSOCK_TYPE_SEQPACKET, "SEQPACKET" }) TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_INVALID); TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_REQUEST); TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RESPONSE); TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RST); TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_SHUTDOWN); TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RW); TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_CREDIT_UPDATE); TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_CREDIT_REQUEST); #define show_op(val) \ __print_symbolic(val, \ { VIRTIO_VSOCK_OP_INVALID, "INVALID" }, \ { VIRTIO_VSOCK_OP_REQUEST, "REQUEST" }, \ { VIRTIO_VSOCK_OP_RESPONSE, "RESPONSE" }, \ { VIRTIO_VSOCK_OP_RST, "RST" }, \ { VIRTIO_VSOCK_OP_SHUTDOWN, "SHUTDOWN" }, \ { VIRTIO_VSOCK_OP_RW, "RW" }, \ { VIRTIO_VSOCK_OP_CREDIT_UPDATE, "CREDIT_UPDATE" }, \ { VIRTIO_VSOCK_OP_CREDIT_REQUEST, "CREDIT_REQUEST" }) TRACE_EVENT(virtio_transport_alloc_pkt, TP_PROTO( __u32 src_cid, __u32 src_port, __u32 dst_cid, __u32 dst_port, __u32 len, __u16 type, __u16 op, __u32 flags, bool zcopy ), TP_ARGS( src_cid, src_port, dst_cid, dst_port, len, type, op, flags, zcopy ), TP_STRUCT__entry( __field(__u32, src_cid) __field(__u32, src_port) __field(__u32, dst_cid) __field(__u32, dst_port) __field(__u32, len) __field(__u16, type) __field(__u16, op) __field(__u32, flags) __field(bool, zcopy) ), TP_fast_assign( __entry->src_cid = src_cid; __entry->src_port = src_port; __entry->dst_cid = dst_cid; __entry->dst_port = dst_port; __entry->len = len; __entry->type = type; __entry->op = op; __entry->flags = flags; __entry->zcopy = zcopy; ), TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x zcopy=%s", __entry->src_cid, __entry->src_port, __entry->dst_cid, __entry->dst_port, __entry->len, show_type(__entry->type), show_op(__entry->op), __entry->flags, __entry->zcopy ? "true" : "false") ); TRACE_EVENT(virtio_transport_recv_pkt, TP_PROTO( __u32 src_cid, __u32 src_port, __u32 dst_cid, __u32 dst_port, __u32 len, __u16 type, __u16 op, __u32 flags, __u32 buf_alloc, __u32 fwd_cnt ), TP_ARGS( src_cid, src_port, dst_cid, dst_port, len, type, op, flags, buf_alloc, fwd_cnt ), TP_STRUCT__entry( __field(__u32, src_cid) __field(__u32, src_port) __field(__u32, dst_cid) __field(__u32, dst_port) __field(__u32, len) __field(__u16, type) __field(__u16, op) __field(__u32, flags) __field(__u32, buf_alloc) __field(__u32, fwd_cnt) ), TP_fast_assign( __entry->src_cid = src_cid; __entry->src_port = src_port; __entry->dst_cid = dst_cid; __entry->dst_port = dst_port; __entry->len = len; __entry->type = type; __entry->op = op; __entry->flags = flags; __entry->buf_alloc = buf_alloc; __entry->fwd_cnt = fwd_cnt; ), TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x " "buf_alloc=%u fwd_cnt=%u", __entry->src_cid, __entry->src_port, __entry->dst_cid, __entry->dst_port, __entry->len, show_type(__entry->type), show_op(__entry->op), __entry->flags, __entry->buf_alloc, __entry->fwd_cnt) ); #endif /* _TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H */ #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE vsock_virtio_transport_common /* This part must be outside protection */ #include <trace/define_trace.h> |
19 13 18 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 | /* * linux/fs/nls/nls_cp869.c * * Charset cp869 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0386, 0x0000, 0x00b7, 0x00ac, 0x00a6, 0x2018, 0x2019, 0x0388, 0x2015, 0x0389, /* 0x90*/ 0x038a, 0x03aa, 0x038c, 0x0000, 0x0000, 0x038e, 0x03ab, 0x00a9, 0x038f, 0x00b2, 0x00b3, 0x03ac, 0x00a3, 0x03ad, 0x03ae, 0x03af, /* 0xa0*/ 0x03ca, 0x0390, 0x03cc, 0x03cd, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x00bd, 0x0398, 0x0399, 0x00ab, 0x00bb, /* 0xb0*/ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x039a, 0x039b, 0x039c, 0x039d, 0x2563, 0x2551, 0x2557, 0x255d, 0x039e, 0x039f, 0x2510, /* 0xc0*/ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x03a0, 0x03a1, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x03a3, /* 0xd0*/ 0x03a4, 0x03a5, 0x03a6, 0x03a7, 0x03a8, 0x03a9, 0x03b1, 0x03b2, 0x03b3, 0x2518, 0x250c, 0x2588, 0x2584, 0x03b4, 0x03b5, 0x2580, /* 0xe0*/ 0x03b6, 0x03b7, 0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, 0x03c0, 0x03c1, 0x03c3, 0x03c2, 0x03c4, 0x0384, /* 0xf0*/ 0x00ad, 0x00b1, 0x03c5, 0x03c6, 0x03c7, 0x00a7, 0x03c8, 0x0385, 0x00b0, 0x00a8, 0x03c9, 0x03cb, 0x03b0, 0x03ce, 0x25a0, 0x00a0, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xff, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x8a, 0xf5, /* 0xa0-0xa7 */ 0xf9, 0x97, 0x00, 0xae, 0x89, 0xf0, 0x00, 0x00, /* 0xa8-0xaf */ 0xf8, 0xf1, 0x99, 0x9a, 0x00, 0x00, 0x00, 0x88, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0xaf, 0x00, 0xab, 0x00, 0x00, /* 0xb8-0xbf */ }; static const unsigned char page03[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0xef, 0xf7, 0x86, 0x00, /* 0x80-0x87 */ 0x8d, 0x8f, 0x90, 0x00, 0x92, 0x00, 0x95, 0x98, /* 0x88-0x8f */ 0xa1, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, /* 0x90-0x97 */ 0xac, 0xad, 0xb5, 0xb6, 0xb7, 0xb8, 0xbd, 0xbe, /* 0x98-0x9f */ 0xc6, 0xc7, 0x00, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, /* 0xa0-0xa7 */ 0xd4, 0xd5, 0x91, 0x96, 0x9b, 0x9d, 0x9e, 0x9f, /* 0xa8-0xaf */ 0xfc, 0xd6, 0xd7, 0xd8, 0xdd, 0xde, 0xe0, 0xe1, /* 0xb0-0xb7 */ 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, /* 0xb8-0xbf */ 0xea, 0xeb, 0xed, 0xec, 0xee, 0xf2, 0xf3, 0xf4, /* 0xc0-0xc7 */ 0xf6, 0xfa, 0xa0, 0xfb, 0xa2, 0xa3, 0xfd, 0x00, /* 0xc8-0xcf */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, /* 0x10-0x17 */ 0x8b, 0x8c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ }; static const unsigned char page25[256] = { 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char *const page_uni2charset[256] = { page00, NULL, NULL, page03, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, NULL, NULL, NULL, NULL, page25, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9b, 0x00, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x9d, 0x8e, 0x9e, /* 0x88-0x8f */ 0x9f, 0xa0, 0xa2, 0x00, 0x00, 0xa3, 0xfb, 0x97, /* 0x90-0x97 */ 0xfd, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xd6, 0xd7, 0xd8, 0xdd, /* 0xa0-0xa7 */ 0xde, 0xe0, 0xe1, 0xab, 0xe2, 0xe3, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xe4, 0xe5, 0xe6, /* 0xb0-0xb7 */ 0xe7, 0xb9, 0xba, 0xbb, 0xbc, 0xe8, 0xe9, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xea, 0xeb, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xec, /* 0xc8-0xcf */ 0xee, 0xf2, 0xf3, 0xf4, 0xf6, 0xfa, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0x00, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x00, 0x00, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x86, 0x9c, 0x8d, 0x8f, 0x90, /* 0x98-0x9f */ 0x91, 0xa1, 0x92, 0x95, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xa4, 0xa5, /* 0xd0-0xd7 */ 0xa6, 0xd9, 0xda, 0xdb, 0xdc, 0xa7, 0xa8, 0xdf, /* 0xd8-0xdf */ 0xa9, 0xaa, 0xac, 0xad, 0xb5, 0xb6, 0xb7, 0xb8, /* 0xe0-0xe7 */ 0xbd, 0xbe, 0xc6, 0xc7, 0xcf, 0xcf, 0xd0, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xd1, 0xd2, 0xd3, 0xf5, 0xd4, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xd5, 0x96, 0xfc, 0x98, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "cp869", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp869(void) { return register_nls(&table); } static void __exit exit_nls_cp869(void) { unregister_nls(&table); } module_init(init_nls_cp869) module_exit(exit_nls_cp869) MODULE_LICENSE("Dual BSD/GPL"); |
1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 | // SPDX-License-Identifier: GPL-2.0-only /* DVB USB compliant linux driver for Conexant USB reference design. * * The Conexant reference design I saw on their website was only for analogue * capturing (using the cx25842). The box I took to write this driver (reverse * engineered) is the one labeled Medion MD95700. In addition to the cx25842 * for analogue capturing it also has a cx22702 DVB-T demodulator on the main * board. Besides it has a atiremote (X10) and a USB2.0 hub onboard. * * Maybe it is a little bit premature to call this driver cxusb, but I assume * the USB protocol is identical or at least inherited from the reference * design, so it can be reused for the "analogue-only" device (if it will * appear at all). * * * Copyright (C) 2005 Patrick Boettcher (patrick.boettcher@posteo.de) * Copyright (C) 2006 Michael Krufky (mkrufky@linuxtv.org) * Copyright (C) 2006, 2007 Chris Pascoe (c.pascoe@itee.uq.edu.au) * Copyright (C) 2011, 2017 Maciej S. Szmigiero (mail@maciej.szmigiero.name) * * see Documentation/driver-api/media/drivers/dvb-usb.rst for more information */ #include <media/tuner.h> #include <linux/delay.h> #include <linux/device.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/vmalloc.h> #include "cxusb.h" #include "cx22702.h" #include "lgdt330x.h" #include "mt352.h" #include "mt352_priv.h" #include "zl10353.h" #include "xc2028.h" #include "tuner-simple.h" #include "mxl5005s.h" #include "max2165.h" #include "dib7000p.h" #include "dib0070.h" #include "lgs8gxx.h" #include "atbm8830.h" #include "si2168.h" #include "si2157.h" /* debug */ int dvb_usb_cxusb_debug; module_param_named(debug, dvb_usb_cxusb_debug, int, 0644); MODULE_PARM_DESC(debug, "set debugging level (see cxusb.h)." DVB_USB_DEBUG_STATUS); DVB_DEFINE_MOD_OPT_ADAPTER_NR(adapter_nr); enum cxusb_table_index { MEDION_MD95700, DVICO_BLUEBIRD_LG064F_COLD, DVICO_BLUEBIRD_LG064F_WARM, DVICO_BLUEBIRD_DUAL_1_COLD, DVICO_BLUEBIRD_DUAL_1_WARM, DVICO_BLUEBIRD_LGZ201_COLD, DVICO_BLUEBIRD_LGZ201_WARM, DVICO_BLUEBIRD_TH7579_COLD, DVICO_BLUEBIRD_TH7579_WARM, DIGITALNOW_BLUEBIRD_DUAL_1_COLD, DIGITALNOW_BLUEBIRD_DUAL_1_WARM, DVICO_BLUEBIRD_DUAL_2_COLD, DVICO_BLUEBIRD_DUAL_2_WARM, DVICO_BLUEBIRD_DUAL_4, DVICO_BLUEBIRD_DVB_T_NANO_2, DVICO_BLUEBIRD_DVB_T_NANO_2_NFW_WARM, AVERMEDIA_VOLAR_A868R, DVICO_BLUEBIRD_DUAL_4_REV_2, CONEXANT_D680_DMB, MYGICA_D689, NR__cxusb_table_index }; static struct usb_device_id cxusb_table[]; int cxusb_ctrl_msg(struct dvb_usb_device *d, u8 cmd, const u8 *wbuf, int wlen, u8 *rbuf, int rlen) { struct cxusb_state *st = d->priv; int ret; if (1 + wlen > MAX_XFER_SIZE) { warn("i2c wr: len=%d is too big!\n", wlen); return -EOPNOTSUPP; } if (rlen > MAX_XFER_SIZE) { warn("i2c rd: len=%d is too big!\n", rlen); return -EOPNOTSUPP; } mutex_lock(&d->data_mutex); st->data[0] = cmd; memcpy(&st->data[1], wbuf, wlen); ret = dvb_usb_generic_rw(d, st->data, 1 + wlen, st->data, rlen, 0); if (!ret && rbuf && rlen) memcpy(rbuf, st->data, rlen); mutex_unlock(&d->data_mutex); return ret; } /* GPIO */ static void cxusb_gpio_tuner(struct dvb_usb_device *d, int onoff) { struct cxusb_state *st = d->priv; u8 o[2], i; if (st->gpio_write_state[GPIO_TUNER] == onoff && !st->gpio_write_refresh[GPIO_TUNER]) return; o[0] = GPIO_TUNER; o[1] = onoff; cxusb_ctrl_msg(d, CMD_GPIO_WRITE, o, 2, &i, 1); if (i != 0x01) dev_info(&d->udev->dev, "gpio_write failed.\n"); st->gpio_write_state[GPIO_TUNER] = onoff; st->gpio_write_refresh[GPIO_TUNER] = false; } static int cxusb_bluebird_gpio_rw(struct dvb_usb_device *d, u8 changemask, u8 newval) { u8 o[2], gpio_state; int rc; o[0] = 0xff & ~changemask; /* mask of bits to keep */ o[1] = newval & changemask; /* new values for bits */ rc = cxusb_ctrl_msg(d, CMD_BLUEBIRD_GPIO_RW, o, 2, &gpio_state, 1); if (rc < 0 || (gpio_state & changemask) != (newval & changemask)) dev_info(&d->udev->dev, "bluebird_gpio_write failed.\n"); return rc < 0 ? rc : gpio_state; } static void cxusb_bluebird_gpio_pulse(struct dvb_usb_device *d, u8 pin, int low) { cxusb_bluebird_gpio_rw(d, pin, low ? 0 : pin); msleep(5); cxusb_bluebird_gpio_rw(d, pin, low ? pin : 0); } static void cxusb_nano2_led(struct dvb_usb_device *d, int onoff) { cxusb_bluebird_gpio_rw(d, 0x40, onoff ? 0 : 0x40); } static int cxusb_d680_dmb_gpio_tuner(struct dvb_usb_device *d, u8 addr, int onoff) { u8 o[2] = {addr, onoff}; u8 i; int rc; rc = cxusb_ctrl_msg(d, CMD_GPIO_WRITE, o, 2, &i, 1); if (rc < 0) return rc; if (i == 0x01) return 0; dev_info(&d->udev->dev, "gpio_write failed.\n"); return -EIO; } /* I2C */ static int cxusb_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg msg[], int num) { struct dvb_usb_device *d = i2c_get_adapdata(adap); int ret; int i; if (mutex_lock_interruptible(&d->i2c_mutex) < 0) return -EAGAIN; for (i = 0; i < num; i++) { if (le16_to_cpu(d->udev->descriptor.idVendor) == USB_VID_MEDION) switch (msg[i].addr) { case 0x63: cxusb_gpio_tuner(d, 0); break; default: cxusb_gpio_tuner(d, 1); break; } if (msg[i].flags & I2C_M_RD) { /* read only */ u8 obuf[3], ibuf[MAX_XFER_SIZE]; if (1 + msg[i].len > sizeof(ibuf)) { warn("i2c rd: len=%d is too big!\n", msg[i].len); ret = -EOPNOTSUPP; goto unlock; } obuf[0] = 0; obuf[1] = msg[i].len; obuf[2] = msg[i].addr; if (cxusb_ctrl_msg(d, CMD_I2C_READ, obuf, 3, ibuf, 1 + msg[i].len) < 0) { warn("i2c read failed"); break; } memcpy(msg[i].buf, &ibuf[1], msg[i].len); } else if (i + 1 < num && (msg[i + 1].flags & I2C_M_RD) && msg[i].addr == msg[i + 1].addr) { /* write to then read from same address */ u8 obuf[MAX_XFER_SIZE], ibuf[MAX_XFER_SIZE]; if (3 + msg[i].len > sizeof(obuf)) { warn("i2c wr: len=%d is too big!\n", msg[i].len); ret = -EOPNOTSUPP; goto unlock; } if (1 + msg[i + 1].len > sizeof(ibuf)) { warn("i2c rd: len=%d is too big!\n", msg[i + 1].len); ret = -EOPNOTSUPP; goto unlock; } obuf[0] = msg[i].len; obuf[1] = msg[i + 1].len; obuf[2] = msg[i].addr; memcpy(&obuf[3], msg[i].buf, msg[i].len); if (cxusb_ctrl_msg(d, CMD_I2C_READ, obuf, 3 + msg[i].len, ibuf, 1 + msg[i + 1].len) < 0) break; if (ibuf[0] != 0x08) dev_info(&d->udev->dev, "i2c read may have failed\n"); memcpy(msg[i + 1].buf, &ibuf[1], msg[i + 1].len); i++; } else { /* write only */ u8 obuf[MAX_XFER_SIZE], ibuf; if (2 + msg[i].len > sizeof(obuf)) { warn("i2c wr: len=%d is too big!\n", msg[i].len); ret = -EOPNOTSUPP; goto unlock; } obuf[0] = msg[i].addr; obuf[1] = msg[i].len; memcpy(&obuf[2], msg[i].buf, msg[i].len); if (cxusb_ctrl_msg(d, CMD_I2C_WRITE, obuf, 2 + msg[i].len, &ibuf, 1) < 0) break; if (ibuf != 0x08) dev_info(&d->udev->dev, "i2c write may have failed\n"); } } if (i == num) ret = num; else ret = -EREMOTEIO; unlock: mutex_unlock(&d->i2c_mutex); return ret; } static u32 cxusb_i2c_func(struct i2c_adapter *adapter) { return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL; } static struct i2c_algorithm cxusb_i2c_algo = { .master_xfer = cxusb_i2c_xfer, .functionality = cxusb_i2c_func, }; static int _cxusb_power_ctrl(struct dvb_usb_device *d, int onoff) { u8 b = 0; dev_info(&d->udev->dev, "setting power %s\n", onoff ? "ON" : "OFF"); if (onoff) return cxusb_ctrl_msg(d, CMD_POWER_ON, &b, 1, NULL, 0); else return cxusb_ctrl_msg(d, CMD_POWER_OFF, &b, 1, NULL, 0); } static int cxusb_power_ctrl(struct dvb_usb_device *d, int onoff) { bool is_medion = d->props.devices[0].warm_ids[0] == &cxusb_table[MEDION_MD95700]; int ret; if (is_medion && !onoff) { struct cxusb_medion_dev *cxdev = d->priv; mutex_lock(&cxdev->open_lock); if (cxdev->open_type == CXUSB_OPEN_ANALOG) { dev_info(&d->udev->dev, "preventing DVB core from setting power OFF while we are in analog mode\n"); ret = -EBUSY; goto ret_unlock; } } ret = _cxusb_power_ctrl(d, onoff); ret_unlock: if (is_medion && !onoff) { struct cxusb_medion_dev *cxdev = d->priv; mutex_unlock(&cxdev->open_lock); } return ret; } static int cxusb_aver_power_ctrl(struct dvb_usb_device *d, int onoff) { int ret; if (!onoff) return cxusb_ctrl_msg(d, CMD_POWER_OFF, NULL, 0, NULL, 0); if (d->state == DVB_USB_STATE_INIT && usb_set_interface(d->udev, 0, 0) < 0) err("set interface failed"); do { /* Nothing */ } while (!(ret = cxusb_ctrl_msg(d, CMD_POWER_ON, NULL, 0, NULL, 0)) && !(ret = cxusb_ctrl_msg(d, 0x15, NULL, 0, NULL, 0)) && !(ret = cxusb_ctrl_msg(d, 0x17, NULL, 0, NULL, 0)) && 0); if (!ret) { /* * FIXME: We don't know why, but we need to configure the * lgdt3303 with the register settings below on resume */ int i; u8 buf; static const u8 bufs[] = { 0x0e, 0x2, 0x00, 0x7f, 0x0e, 0x2, 0x02, 0xfe, 0x0e, 0x2, 0x02, 0x01, 0x0e, 0x2, 0x00, 0x03, 0x0e, 0x2, 0x0d, 0x40, 0x0e, 0x2, 0x0e, 0x87, 0x0e, 0x2, 0x0f, 0x8e, 0x0e, 0x2, 0x10, 0x01, 0x0e, 0x2, 0x14, 0xd7, 0x0e, 0x2, 0x47, 0x88, }; msleep(20); for (i = 0; i < ARRAY_SIZE(bufs); i += 4 / sizeof(u8)) { ret = cxusb_ctrl_msg(d, CMD_I2C_WRITE, bufs + i, 4, &buf, 1); if (ret) break; if (buf != 0x8) return -EREMOTEIO; } } return ret; } static int cxusb_bluebird_power_ctrl(struct dvb_usb_device *d, int onoff) { u8 b = 0; if (onoff) return cxusb_ctrl_msg(d, CMD_POWER_ON, &b, 1, NULL, 0); else return 0; } static int cxusb_nano2_power_ctrl(struct dvb_usb_device *d, int onoff) { int rc = 0; rc = cxusb_power_ctrl(d, onoff); if (!onoff) cxusb_nano2_led(d, 0); return rc; } static int cxusb_d680_dmb_power_ctrl(struct dvb_usb_device *d, int onoff) { int ret; u8 b; ret = cxusb_power_ctrl(d, onoff); if (!onoff) return ret; msleep(128); cxusb_ctrl_msg(d, CMD_DIGITAL, NULL, 0, &b, 1); msleep(100); return ret; } static int cxusb_streaming_ctrl(struct dvb_usb_adapter *adap, int onoff) { struct dvb_usb_device *dvbdev = adap->dev; bool is_medion = dvbdev->props.devices[0].warm_ids[0] == &cxusb_table[MEDION_MD95700]; u8 buf[2] = { 0x03, 0x00 }; if (is_medion && onoff) { int ret; ret = cxusb_medion_get(dvbdev, CXUSB_OPEN_DIGITAL); if (ret != 0) return ret; } if (onoff) cxusb_ctrl_msg(dvbdev, CMD_STREAMING_ON, buf, 2, NULL, 0); else cxusb_ctrl_msg(dvbdev, CMD_STREAMING_OFF, NULL, 0, NULL, 0); if (is_medion && !onoff) cxusb_medion_put(dvbdev); return 0; } static int cxusb_aver_streaming_ctrl(struct dvb_usb_adapter *adap, int onoff) { if (onoff) cxusb_ctrl_msg(adap->dev, CMD_AVER_STREAM_ON, NULL, 0, NULL, 0); else cxusb_ctrl_msg(adap->dev, CMD_AVER_STREAM_OFF, NULL, 0, NULL, 0); return 0; } static void cxusb_d680_dmb_drain_message(struct dvb_usb_device *d) { int ep = d->props.generic_bulk_ctrl_endpoint; const int timeout = 100; const int junk_len = 32; u8 *junk; int rd_count; /* Discard remaining data in video pipe */ junk = kmalloc(junk_len, GFP_KERNEL); if (!junk) return; while (1) { if (usb_bulk_msg(d->udev, usb_rcvbulkpipe(d->udev, ep), junk, junk_len, &rd_count, timeout) < 0) break; if (!rd_count) break; } kfree(junk); } static void cxusb_d680_dmb_drain_video(struct dvb_usb_device *d) { struct usb_data_stream_properties *p = &d->props.adapter[0].fe[0].stream; const int timeout = 100; const int junk_len = p->u.bulk.buffersize; u8 *junk; int rd_count; /* Discard remaining data in video pipe */ junk = kmalloc(junk_len, GFP_KERNEL); if (!junk) return; while (1) { if (usb_bulk_msg(d->udev, usb_rcvbulkpipe(d->udev, p->endpoint), junk, junk_len, &rd_count, timeout) < 0) break; if (!rd_count) break; } kfree(junk); } static int cxusb_d680_dmb_streaming_ctrl(struct dvb_usb_adapter *adap, int onoff) { if (onoff) { u8 buf[2] = { 0x03, 0x00 }; cxusb_d680_dmb_drain_video(adap->dev); return cxusb_ctrl_msg(adap->dev, CMD_STREAMING_ON, buf, sizeof(buf), NULL, 0); } else { int ret = cxusb_ctrl_msg(adap->dev, CMD_STREAMING_OFF, NULL, 0, NULL, 0); return ret; } } static int cxusb_rc_query(struct dvb_usb_device *d) { u8 ircode[4]; if (cxusb_ctrl_msg(d, CMD_GET_IR_CODE, NULL, 0, ircode, 4) < 0) return 0; if (ircode[2] || ircode[3]) rc_keydown(d->rc_dev, RC_PROTO_NEC, RC_SCANCODE_NEC(~ircode[2] & 0xff, ircode[3]), 0); return 0; } static int cxusb_bluebird2_rc_query(struct dvb_usb_device *d) { u8 ircode[4]; struct i2c_msg msg = { .addr = 0x6b, .flags = I2C_M_RD, .buf = ircode, .len = 4 }; if (cxusb_i2c_xfer(&d->i2c_adap, &msg, 1) != 1) return 0; if (ircode[1] || ircode[2]) rc_keydown(d->rc_dev, RC_PROTO_NEC, RC_SCANCODE_NEC(~ircode[1] & 0xff, ircode[2]), 0); return 0; } static int cxusb_d680_dmb_rc_query(struct dvb_usb_device *d) { u8 ircode[2]; if (cxusb_ctrl_msg(d, 0x10, NULL, 0, ircode, 2) < 0) return 0; if (ircode[0] || ircode[1]) rc_keydown(d->rc_dev, RC_PROTO_UNKNOWN, RC_SCANCODE_RC5(ircode[0], ircode[1]), 0); return 0; } static int cxusb_dee1601_demod_init(struct dvb_frontend *fe) { static u8 clock_config[] = { CLOCK_CTL, 0x38, 0x28 }; static u8 reset[] = { RESET, 0x80 }; static u8 adc_ctl_1_cfg[] = { ADC_CTL_1, 0x40 }; static u8 agc_cfg[] = { AGC_TARGET, 0x28, 0x20 }; static u8 gpp_ctl_cfg[] = { GPP_CTL, 0x33 }; static u8 capt_range_cfg[] = { CAPT_RANGE, 0x32 }; mt352_write(fe, clock_config, sizeof(clock_config)); udelay(200); mt352_write(fe, reset, sizeof(reset)); mt352_write(fe, adc_ctl_1_cfg, sizeof(adc_ctl_1_cfg)); mt352_write(fe, agc_cfg, sizeof(agc_cfg)); mt352_write(fe, gpp_ctl_cfg, sizeof(gpp_ctl_cfg)); mt352_write(fe, capt_range_cfg, sizeof(capt_range_cfg)); return 0; } static int cxusb_mt352_demod_init(struct dvb_frontend *fe) { /* used in both lgz201 and th7579 */ static u8 clock_config[] = { CLOCK_CTL, 0x38, 0x29 }; static u8 reset[] = { RESET, 0x80 }; static u8 adc_ctl_1_cfg[] = { ADC_CTL_1, 0x40 }; static u8 agc_cfg[] = { AGC_TARGET, 0x24, 0x20 }; static u8 gpp_ctl_cfg[] = { GPP_CTL, 0x33 }; static u8 capt_range_cfg[] = { CAPT_RANGE, 0x32 }; mt352_write(fe, clock_config, sizeof(clock_config)); udelay(200); mt352_write(fe, reset, sizeof(reset)); mt352_write(fe, adc_ctl_1_cfg, sizeof(adc_ctl_1_cfg)); mt352_write(fe, agc_cfg, sizeof(agc_cfg)); mt352_write(fe, gpp_ctl_cfg, sizeof(gpp_ctl_cfg)); mt352_write(fe, capt_range_cfg, sizeof(capt_range_cfg)); return 0; } static struct cx22702_config cxusb_cx22702_config = { .demod_address = 0x63, .output_mode = CX22702_PARALLEL_OUTPUT, }; static struct lgdt330x_config cxusb_lgdt3303_config = { .demod_chip = LGDT3303, }; static struct lgdt330x_config cxusb_aver_lgdt3303_config = { .demod_chip = LGDT3303, .clock_polarity_flip = 2, }; static struct mt352_config cxusb_dee1601_config = { .demod_address = 0x0f, .demod_init = cxusb_dee1601_demod_init, }; static struct zl10353_config cxusb_zl10353_dee1601_config = { .demod_address = 0x0f, .parallel_ts = 1, }; static struct mt352_config cxusb_mt352_config = { /* used in both lgz201 and th7579 */ .demod_address = 0x0f, .demod_init = cxusb_mt352_demod_init, }; static struct zl10353_config cxusb_zl10353_xc3028_config = { .demod_address = 0x0f, .if2 = 45600, .no_tuner = 1, .parallel_ts = 1, }; static struct zl10353_config cxusb_zl10353_xc3028_config_no_i2c_gate = { .demod_address = 0x0f, .if2 = 45600, .no_tuner = 1, .parallel_ts = 1, .disable_i2c_gate_ctrl = 1, }; static struct mt352_config cxusb_mt352_xc3028_config = { .demod_address = 0x0f, .if2 = 4560, .no_tuner = 1, .demod_init = cxusb_mt352_demod_init, }; /* FIXME: needs tweaking */ static struct mxl5005s_config aver_a868r_tuner = { .i2c_address = 0x63, .if_freq = 6000000UL, .xtal_freq = CRYSTAL_FREQ_16000000HZ, .agc_mode = MXL_SINGLE_AGC, .tracking_filter = MXL_TF_C, .rssi_enable = MXL_RSSI_ENABLE, .cap_select = MXL_CAP_SEL_ENABLE, .div_out = MXL_DIV_OUT_4, .clock_out = MXL_CLOCK_OUT_DISABLE, .output_load = MXL5005S_IF_OUTPUT_LOAD_200_OHM, .top = MXL5005S_TOP_25P2, .mod_mode = MXL_DIGITAL_MODE, .if_mode = MXL_ZERO_IF, .AgcMasterByte = 0x00, }; /* FIXME: needs tweaking */ static struct mxl5005s_config d680_dmb_tuner = { .i2c_address = 0x63, .if_freq = 36125000UL, .xtal_freq = CRYSTAL_FREQ_16000000HZ, .agc_mode = MXL_SINGLE_AGC, .tracking_filter = MXL_TF_C, .rssi_enable = MXL_RSSI_ENABLE, .cap_select = MXL_CAP_SEL_ENABLE, .div_out = MXL_DIV_OUT_4, .clock_out = MXL_CLOCK_OUT_DISABLE, .output_load = MXL5005S_IF_OUTPUT_LOAD_200_OHM, .top = MXL5005S_TOP_25P2, .mod_mode = MXL_DIGITAL_MODE, .if_mode = MXL_ZERO_IF, .AgcMasterByte = 0x00, }; static struct max2165_config mygica_d689_max2165_cfg = { .i2c_address = 0x60, .osc_clk = 20 }; /* Callbacks for DVB USB */ static int cxusb_fmd1216me_tuner_attach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *dvbdev = adap->dev; bool is_medion = dvbdev->props.devices[0].warm_ids[0] == &cxusb_table[MEDION_MD95700]; dvb_attach(simple_tuner_attach, adap->fe_adap[0].fe, &dvbdev->i2c_adap, 0x61, TUNER_PHILIPS_FMD1216ME_MK3); if (is_medion && adap->fe_adap[0].fe) /* * make sure that DVB core won't put to sleep (reset, really) * tuner when we might be open in analog mode */ adap->fe_adap[0].fe->ops.tuner_ops.sleep = NULL; return 0; } static int cxusb_dee1601_tuner_attach(struct dvb_usb_adapter *adap) { dvb_attach(dvb_pll_attach, adap->fe_adap[0].fe, 0x61, NULL, DVB_PLL_THOMSON_DTT7579); return 0; } static int cxusb_lgz201_tuner_attach(struct dvb_usb_adapter *adap) { dvb_attach(dvb_pll_attach, adap->fe_adap[0].fe, 0x61, NULL, DVB_PLL_LG_Z201); return 0; } static int cxusb_dtt7579_tuner_attach(struct dvb_usb_adapter *adap) { dvb_attach(dvb_pll_attach, adap->fe_adap[0].fe, 0x60, NULL, DVB_PLL_THOMSON_DTT7579); return 0; } static int cxusb_lgh064f_tuner_attach(struct dvb_usb_adapter *adap) { dvb_attach(simple_tuner_attach, adap->fe_adap[0].fe, &adap->dev->i2c_adap, 0x61, TUNER_LG_TDVS_H06XF); return 0; } static int dvico_bluebird_xc2028_callback(void *ptr, int component, int command, int arg) { struct dvb_usb_adapter *adap = ptr; struct dvb_usb_device *d = adap->dev; switch (command) { case XC2028_TUNER_RESET: dev_info(&d->udev->dev, "XC2028_TUNER_RESET %d\n", arg); cxusb_bluebird_gpio_pulse(d, 0x01, 1); break; case XC2028_RESET_CLK: dev_info(&d->udev->dev, "XC2028_RESET_CLK %d\n", arg); break; case XC2028_I2C_FLUSH: break; default: dev_info(&d->udev->dev, "unknown command %d, arg %d\n", command, arg); return -EINVAL; } return 0; } static int cxusb_dvico_xc3028_tuner_attach(struct dvb_usb_adapter *adap) { struct dvb_frontend *fe; struct xc2028_config cfg = { .i2c_adap = &adap->dev->i2c_adap, .i2c_addr = 0x61, }; static struct xc2028_ctrl ctl = { .fname = XC2028_DEFAULT_FIRMWARE, .max_len = 64, .demod = XC3028_FE_ZARLINK456, }; /* FIXME: generalize & move to common area */ adap->fe_adap[0].fe->callback = dvico_bluebird_xc2028_callback; fe = dvb_attach(xc2028_attach, adap->fe_adap[0].fe, &cfg); if (!fe || !fe->ops.tuner_ops.set_config) return -EIO; fe->ops.tuner_ops.set_config(fe, &ctl); return 0; } static int cxusb_mxl5003s_tuner_attach(struct dvb_usb_adapter *adap) { dvb_attach(mxl5005s_attach, adap->fe_adap[0].fe, &adap->dev->i2c_adap, &aver_a868r_tuner); return 0; } static int cxusb_d680_dmb_tuner_attach(struct dvb_usb_adapter *adap) { struct dvb_frontend *fe; fe = dvb_attach(mxl5005s_attach, adap->fe_adap[0].fe, &adap->dev->i2c_adap, &d680_dmb_tuner); return (!fe) ? -EIO : 0; } static int cxusb_mygica_d689_tuner_attach(struct dvb_usb_adapter *adap) { struct dvb_frontend *fe; fe = dvb_attach(max2165_attach, adap->fe_adap[0].fe, &adap->dev->i2c_adap, &mygica_d689_max2165_cfg); return (!fe) ? -EIO : 0; } static int cxusb_medion_fe_ts_bus_ctrl(struct dvb_frontend *fe, int acquire) { struct dvb_usb_adapter *adap = fe->dvb->priv; struct dvb_usb_device *dvbdev = adap->dev; if (acquire) return cxusb_medion_get(dvbdev, CXUSB_OPEN_DIGITAL); cxusb_medion_put(dvbdev); return 0; } static int cxusb_medion_set_mode(struct dvb_usb_device *dvbdev, bool digital) { struct cxusb_state *st = dvbdev->priv; int ret; u8 b; unsigned int i; /* * switching mode while doing an I2C transaction often causes * the device to crash */ mutex_lock(&dvbdev->i2c_mutex); if (digital) { ret = usb_set_interface(dvbdev->udev, 0, 6); if (ret != 0) { dev_err(&dvbdev->udev->dev, "digital interface selection failed (%d)\n", ret); goto ret_unlock; } } else { ret = usb_set_interface(dvbdev->udev, 0, 1); if (ret != 0) { dev_err(&dvbdev->udev->dev, "analog interface selection failed (%d)\n", ret); goto ret_unlock; } } /* pipes need to be cleared after setting interface */ ret = usb_clear_halt(dvbdev->udev, usb_rcvbulkpipe(dvbdev->udev, 1)); if (ret != 0) dev_warn(&dvbdev->udev->dev, "clear halt on IN pipe failed (%d)\n", ret); ret = usb_clear_halt(dvbdev->udev, usb_sndbulkpipe(dvbdev->udev, 1)); if (ret != 0) dev_warn(&dvbdev->udev->dev, "clear halt on OUT pipe failed (%d)\n", ret); ret = cxusb_ctrl_msg(dvbdev, digital ? CMD_DIGITAL : CMD_ANALOG, NULL, 0, &b, 1); if (ret != 0) { dev_err(&dvbdev->udev->dev, "mode switch failed (%d)\n", ret); goto ret_unlock; } /* mode switch seems to reset GPIO states */ for (i = 0; i < ARRAY_SIZE(st->gpio_write_refresh); i++) st->gpio_write_refresh[i] = true; ret_unlock: mutex_unlock(&dvbdev->i2c_mutex); return ret; } static int cxusb_cx22702_frontend_attach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *dvbdev = adap->dev; bool is_medion = dvbdev->props.devices[0].warm_ids[0] == &cxusb_table[MEDION_MD95700]; if (is_medion) { int ret; ret = cxusb_medion_set_mode(dvbdev, true); if (ret) return ret; } adap->fe_adap[0].fe = dvb_attach(cx22702_attach, &cxusb_cx22702_config, &dvbdev->i2c_adap); if (!adap->fe_adap[0].fe) return -EIO; if (is_medion) adap->fe_adap[0].fe->ops.ts_bus_ctrl = cxusb_medion_fe_ts_bus_ctrl; return 0; } static int cxusb_lgdt3303_frontend_attach(struct dvb_usb_adapter *adap) { if (usb_set_interface(adap->dev->udev, 0, 7) < 0) err("set interface failed"); cxusb_ctrl_msg(adap->dev, CMD_DIGITAL, NULL, 0, NULL, 0); adap->fe_adap[0].fe = dvb_attach(lgdt330x_attach, &cxusb_lgdt3303_config, 0x0e, &adap->dev->i2c_adap); if (adap->fe_adap[0].fe) return 0; return -EIO; } static int cxusb_aver_lgdt3303_frontend_attach(struct dvb_usb_adapter *adap) { adap->fe_adap[0].fe = dvb_attach(lgdt330x_attach, &cxusb_aver_lgdt3303_config, 0x0e, &adap->dev->i2c_adap); if (adap->fe_adap[0].fe) return 0; return -EIO; } static int cxusb_mt352_frontend_attach(struct dvb_usb_adapter *adap) { /* used in both lgz201 and th7579 */ if (usb_set_interface(adap->dev->udev, 0, 0) < 0) err("set interface failed"); cxusb_ctrl_msg(adap->dev, CMD_DIGITAL, NULL, 0, NULL, 0); adap->fe_adap[0].fe = dvb_attach(mt352_attach, &cxusb_mt352_config, &adap->dev->i2c_adap); if (adap->fe_adap[0].fe) return 0; return -EIO; } static int cxusb_dee1601_frontend_attach(struct dvb_usb_adapter *adap) { if (usb_set_interface(adap->dev->udev, 0, 0) < 0) err("set interface failed"); cxusb_ctrl_msg(adap->dev, CMD_DIGITAL, NULL, 0, NULL, 0); adap->fe_adap[0].fe = dvb_attach(mt352_attach, &cxusb_dee1601_config, &adap->dev->i2c_adap); if (adap->fe_adap[0].fe) return 0; adap->fe_adap[0].fe = dvb_attach(zl10353_attach, &cxusb_zl10353_dee1601_config, &adap->dev->i2c_adap); if (adap->fe_adap[0].fe) return 0; return -EIO; } static int cxusb_dualdig4_frontend_attach(struct dvb_usb_adapter *adap) { u8 ircode[4]; int i; struct i2c_msg msg = { .addr = 0x6b, .flags = I2C_M_RD, .buf = ircode, .len = 4 }; if (usb_set_interface(adap->dev->udev, 0, 1) < 0) err("set interface failed"); cxusb_ctrl_msg(adap->dev, CMD_DIGITAL, NULL, 0, NULL, 0); /* reset the tuner and demodulator */ cxusb_bluebird_gpio_rw(adap->dev, 0x04, 0); cxusb_bluebird_gpio_pulse(adap->dev, 0x01, 1); cxusb_bluebird_gpio_pulse(adap->dev, 0x02, 1); adap->fe_adap[0].fe = dvb_attach(zl10353_attach, &cxusb_zl10353_xc3028_config_no_i2c_gate, &adap->dev->i2c_adap); if (!adap->fe_adap[0].fe) return -EIO; /* try to determine if there is no IR decoder on the I2C bus */ for (i = 0; adap->dev->props.rc.core.rc_codes && i < 5; i++) { msleep(20); if (cxusb_i2c_xfer(&adap->dev->i2c_adap, &msg, 1) != 1) goto no_IR; if (ircode[0] == 0 && ircode[1] == 0) continue; if (ircode[2] + ircode[3] != 0xff) { no_IR: adap->dev->props.rc.core.rc_codes = NULL; info("No IR receiver detected on this device."); break; } } return 0; } static struct dibx000_agc_config dib7070_agc_config = { .band_caps = BAND_UHF | BAND_VHF | BAND_LBAND | BAND_SBAND, /* * P_agc_use_sd_mod1=0, P_agc_use_sd_mod2=0, P_agc_freq_pwm_div=5, * P_agc_inv_pwm1=0, P_agc_inv_pwm2=0, P_agc_inh_dc_rv_est=0, * P_agc_time_est=3, P_agc_freeze=0, P_agc_nb_est=5, P_agc_write=0 */ .setup = (0 << 15) | (0 << 14) | (5 << 11) | (0 << 10) | (0 << 9) | (0 << 8) | (3 << 5) | (0 << 4) | (5 << 1) | (0 << 0), .inv_gain = 600, .time_stabiliz = 10, .alpha_level = 0, .thlock = 118, .wbd_inv = 0, .wbd_ref = 3530, .wbd_sel = 1, .wbd_alpha = 5, .agc1_max = 65535, .agc1_min = 0, .agc2_max = 65535, .agc2_min = 0, .agc1_pt1 = 0, .agc1_pt2 = 40, .agc1_pt3 = 183, .agc1_slope1 = 206, .agc1_slope2 = 255, .agc2_pt1 = 72, .agc2_pt2 = 152, .agc2_slope1 = 88, .agc2_slope2 = 90, .alpha_mant = 17, .alpha_exp = 27, .beta_mant = 23, .beta_exp = 51, .perform_agc_softsplit = 0, }; static struct dibx000_bandwidth_config dib7070_bw_config_12_mhz = { .internal = 60000, .sampling = 15000, .pll_prediv = 1, .pll_ratio = 20, .pll_range = 3, .pll_reset = 1, .pll_bypass = 0, .enable_refdiv = 0, .bypclk_div = 0, .IO_CLK_en_core = 1, .ADClkSrc = 1, .modulo = 2, /* refsel, sel, freq_15k */ .sad_cfg = (3 << 14) | (1 << 12) | (524 << 0), .ifreq = (0 << 25) | 0, .timf = 20452225, .xtal_hz = 12000000, }; static struct dib7000p_config cxusb_dualdig4_rev2_config = { .output_mode = OUTMODE_MPEG2_PAR_GATED_CLK, .output_mpeg2_in_188_bytes = 1, .agc_config_count = 1, .agc = &dib7070_agc_config, .bw = &dib7070_bw_config_12_mhz, .tuner_is_baseband = 1, .spur_protect = 1, .gpio_dir = 0xfcef, .gpio_val = 0x0110, .gpio_pwm_pos = DIB7000P_GPIO_DEFAULT_PWM_POS, .hostbus_diversity = 1, }; struct dib0700_adapter_state { int (*set_param_save)(struct dvb_frontend *fe); struct dib7000p_ops dib7000p_ops; }; static int cxusb_dualdig4_rev2_frontend_attach(struct dvb_usb_adapter *adap) { struct dib0700_adapter_state *state = adap->priv; if (usb_set_interface(adap->dev->udev, 0, 1) < 0) err("set interface failed"); cxusb_ctrl_msg(adap->dev, CMD_DIGITAL, NULL, 0, NULL, 0); cxusb_bluebird_gpio_pulse(adap->dev, 0x02, 1); if (!dvb_attach(dib7000p_attach, &state->dib7000p_ops)) return -ENODEV; if (state->dib7000p_ops.i2c_enumeration(&adap->dev->i2c_adap, 1, 18, &cxusb_dualdig4_rev2_config) < 0) { pr_warn("Unable to enumerate dib7000p\n"); return -ENODEV; } adap->fe_adap[0].fe = state->dib7000p_ops.init(&adap->dev->i2c_adap, 0x80, &cxusb_dualdig4_rev2_config); if (!adap->fe_adap[0].fe) return -EIO; return 0; } static int dib7070_tuner_reset(struct dvb_frontend *fe, int onoff) { struct dvb_usb_adapter *adap = fe->dvb->priv; struct dib0700_adapter_state *state = adap->priv; return state->dib7000p_ops.set_gpio(fe, 8, 0, !onoff); } static int dib7070_tuner_sleep(struct dvb_frontend *fe, int onoff) { return 0; } static struct dib0070_config dib7070p_dib0070_config = { .i2c_address = DEFAULT_DIB0070_I2C_ADDRESS, .reset = dib7070_tuner_reset, .sleep = dib7070_tuner_sleep, .clock_khz = 12000, }; static int dib7070_set_param_override(struct dvb_frontend *fe) { struct dtv_frontend_properties *p = &fe->dtv_property_cache; struct dvb_usb_adapter *adap = fe->dvb->priv; struct dib0700_adapter_state *state = adap->priv; u16 offset; u8 band = BAND_OF_FREQUENCY(p->frequency / 1000); switch (band) { case BAND_VHF: offset = 950; break; default: case BAND_UHF: offset = 550; break; } state->dib7000p_ops.set_wbd_ref(fe, offset + dib0070_wbd_offset(fe)); return state->set_param_save(fe); } static int cxusb_dualdig4_rev2_tuner_attach(struct dvb_usb_adapter *adap) { struct dib0700_adapter_state *st = adap->priv; struct i2c_adapter *tun_i2c; /* * No need to call dvb7000p_attach here, as it was called * already, as frontend_attach method is called first, and * tuner_attach is only called on success. */ tun_i2c = st->dib7000p_ops.get_i2c_master(adap->fe_adap[0].fe, DIBX000_I2C_INTERFACE_TUNER, 1); if (dvb_attach(dib0070_attach, adap->fe_adap[0].fe, tun_i2c, &dib7070p_dib0070_config) == NULL) return -ENODEV; st->set_param_save = adap->fe_adap[0].fe->ops.tuner_ops.set_params; adap->fe_adap[0].fe->ops.tuner_ops.set_params = dib7070_set_param_override; return 0; } static int cxusb_nano2_frontend_attach(struct dvb_usb_adapter *adap) { if (usb_set_interface(adap->dev->udev, 0, 1) < 0) err("set interface failed"); cxusb_ctrl_msg(adap->dev, CMD_DIGITAL, NULL, 0, NULL, 0); /* reset the tuner and demodulator */ cxusb_bluebird_gpio_rw(adap->dev, 0x04, 0); cxusb_bluebird_gpio_pulse(adap->dev, 0x01, 1); cxusb_bluebird_gpio_pulse(adap->dev, 0x02, 1); adap->fe_adap[0].fe = dvb_attach(zl10353_attach, &cxusb_zl10353_xc3028_config, &adap->dev->i2c_adap); if (adap->fe_adap[0].fe) return 0; adap->fe_adap[0].fe = dvb_attach(mt352_attach, &cxusb_mt352_xc3028_config, &adap->dev->i2c_adap); if (adap->fe_adap[0].fe) return 0; return -EIO; } static struct lgs8gxx_config d680_lgs8gl5_cfg = { .prod = LGS8GXX_PROD_LGS8GL5, .demod_address = 0x19, .serial_ts = 0, .ts_clk_pol = 0, .ts_clk_gated = 1, .if_clk_freq = 30400, /* 30.4 MHz */ .if_freq = 5725, /* 5.725 MHz */ .if_neg_center = 0, .ext_adc = 0, .adc_signed = 0, .if_neg_edge = 0, }; static int cxusb_d680_dmb_frontend_attach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *d = adap->dev; int n; /* Select required USB configuration */ if (usb_set_interface(d->udev, 0, 0) < 0) err("set interface failed"); /* Unblock all USB pipes */ usb_clear_halt(d->udev, usb_sndbulkpipe(d->udev, d->props.generic_bulk_ctrl_endpoint)); usb_clear_halt(d->udev, usb_rcvbulkpipe(d->udev, d->props.generic_bulk_ctrl_endpoint)); usb_clear_halt(d->udev, usb_rcvbulkpipe(d->udev, d->props.adapter[0].fe[0].stream.endpoint)); /* Drain USB pipes to avoid hang after reboot */ for (n = 0; n < 5; n++) { cxusb_d680_dmb_drain_message(d); cxusb_d680_dmb_drain_video(d); msleep(200); } /* Reset the tuner */ if (cxusb_d680_dmb_gpio_tuner(d, 0x07, 0) < 0) { err("clear tuner gpio failed"); return -EIO; } msleep(100); if (cxusb_d680_dmb_gpio_tuner(d, 0x07, 1) < 0) { err("set tuner gpio failed"); return -EIO; } msleep(100); /* Attach frontend */ adap->fe_adap[0].fe = dvb_attach(lgs8gxx_attach, &d680_lgs8gl5_cfg, &d->i2c_adap); if (!adap->fe_adap[0].fe) return -EIO; return 0; } static struct atbm8830_config mygica_d689_atbm8830_cfg = { .prod = ATBM8830_PROD_8830, .demod_address = 0x40, .serial_ts = 0, .ts_sampling_edge = 1, .ts_clk_gated = 0, .osc_clk_freq = 30400, /* in kHz */ .if_freq = 0, /* zero IF */ .zif_swap_iq = 1, .agc_min = 0x2E, .agc_max = 0x90, .agc_hold_loop = 0, }; static int cxusb_mygica_d689_frontend_attach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *d = adap->dev; /* Select required USB configuration */ if (usb_set_interface(d->udev, 0, 0) < 0) err("set interface failed"); /* Unblock all USB pipes */ usb_clear_halt(d->udev, usb_sndbulkpipe(d->udev, d->props.generic_bulk_ctrl_endpoint)); usb_clear_halt(d->udev, usb_rcvbulkpipe(d->udev, d->props.generic_bulk_ctrl_endpoint)); usb_clear_halt(d->udev, usb_rcvbulkpipe(d->udev, d->props.adapter[0].fe[0].stream.endpoint)); /* Reset the tuner */ if (cxusb_d680_dmb_gpio_tuner(d, 0x07, 0) < 0) { err("clear tuner gpio failed"); return -EIO; } msleep(100); if (cxusb_d680_dmb_gpio_tuner(d, 0x07, 1) < 0) { err("set tuner gpio failed"); return -EIO; } msleep(100); /* Attach frontend */ adap->fe_adap[0].fe = dvb_attach(atbm8830_attach, &mygica_d689_atbm8830_cfg, &d->i2c_adap); if (!adap->fe_adap[0].fe) return -EIO; return 0; } /* * DViCO has shipped two devices with the same USB ID, but only one of them * needs a firmware download. Check the device class details to see if they * have non-default values to decide whether the device is actually cold or * not, and forget a match if it turns out we selected the wrong device. */ static int bluebird_fx2_identify_state(struct usb_device *udev, const struct dvb_usb_device_properties *props, const struct dvb_usb_device_description **desc, int *cold) { int wascold = *cold; *cold = udev->descriptor.bDeviceClass == 0xff && udev->descriptor.bDeviceSubClass == 0xff && udev->descriptor.bDeviceProtocol == 0xff; if (*cold && !wascold) *desc = NULL; return 0; } /* * DViCO bluebird firmware needs the "warm" product ID to be patched into the * firmware file before download. */ static const int dvico_firmware_id_offsets[] = { 6638, 3204 }; static int bluebird_patch_dvico_firmware_download(struct usb_device *udev, const struct firmware *fw) { int pos; for (pos = 0; pos < ARRAY_SIZE(dvico_firmware_id_offsets); pos++) { int idoff = dvico_firmware_id_offsets[pos]; if (fw->size < idoff + 4) continue; if (fw->data[idoff] == (USB_VID_DVICO & 0xff) && fw->data[idoff + 1] == USB_VID_DVICO >> 8) { struct firmware new_fw; u8 *new_fw_data = vmalloc(fw->size); int ret; if (!new_fw_data) return -ENOMEM; memcpy(new_fw_data, fw->data, fw->size); new_fw.size = fw->size; new_fw.data = new_fw_data; new_fw_data[idoff + 2] = le16_to_cpu(udev->descriptor.idProduct) + 1; new_fw_data[idoff + 3] = le16_to_cpu(udev->descriptor.idProduct) >> 8; ret = usb_cypress_load_firmware(udev, &new_fw, CYPRESS_FX2); vfree(new_fw_data); return ret; } } return -EINVAL; } int cxusb_medion_get(struct dvb_usb_device *dvbdev, enum cxusb_open_type open_type) { struct cxusb_medion_dev *cxdev = dvbdev->priv; int ret = 0; mutex_lock(&cxdev->open_lock); if (WARN_ON((cxdev->open_type == CXUSB_OPEN_INIT || cxdev->open_type == CXUSB_OPEN_NONE) && cxdev->open_ctr != 0)) { ret = -EINVAL; goto ret_unlock; } if (cxdev->open_type == CXUSB_OPEN_INIT) { ret = -EAGAIN; goto ret_unlock; } if (cxdev->open_ctr == 0) { if (cxdev->open_type != open_type) { dev_info(&dvbdev->udev->dev, "will acquire and switch to %s\n", open_type == CXUSB_OPEN_ANALOG ? "analog" : "digital"); if (open_type == CXUSB_OPEN_ANALOG) { ret = _cxusb_power_ctrl(dvbdev, 1); if (ret != 0) dev_warn(&dvbdev->udev->dev, "powerup for analog switch failed (%d)\n", ret); ret = cxusb_medion_set_mode(dvbdev, false); if (ret != 0) goto ret_unlock; ret = cxusb_medion_analog_init(dvbdev); if (ret != 0) goto ret_unlock; } else { /* digital */ ret = _cxusb_power_ctrl(dvbdev, 1); if (ret != 0) dev_warn(&dvbdev->udev->dev, "powerup for digital switch failed (%d)\n", ret); ret = cxusb_medion_set_mode(dvbdev, true); if (ret != 0) goto ret_unlock; } cxdev->open_type = open_type; } else { dev_info(&dvbdev->udev->dev, "reacquired idle %s\n", open_type == CXUSB_OPEN_ANALOG ? "analog" : "digital"); } cxdev->open_ctr = 1; } else if (cxdev->open_type == open_type) { cxdev->open_ctr++; dev_info(&dvbdev->udev->dev, "acquired %s\n", open_type == CXUSB_OPEN_ANALOG ? "analog" : "digital"); } else { ret = -EBUSY; } ret_unlock: mutex_unlock(&cxdev->open_lock); return ret; } void cxusb_medion_put(struct dvb_usb_device *dvbdev) { struct cxusb_medion_dev *cxdev = dvbdev->priv; mutex_lock(&cxdev->open_lock); if (cxdev->open_type == CXUSB_OPEN_INIT) { WARN_ON(cxdev->open_ctr != 0); cxdev->open_type = CXUSB_OPEN_NONE; goto unlock; } if (!WARN_ON(cxdev->open_ctr < 1)) { cxdev->open_ctr--; dev_info(&dvbdev->udev->dev, "release %s\n", cxdev->open_type == CXUSB_OPEN_ANALOG ? "analog" : "digital"); } unlock: mutex_unlock(&cxdev->open_lock); } /* DVB USB Driver stuff */ static struct dvb_usb_device_properties cxusb_medion_properties; static struct dvb_usb_device_properties cxusb_bluebird_lgh064f_properties; static struct dvb_usb_device_properties cxusb_bluebird_dee1601_properties; static struct dvb_usb_device_properties cxusb_bluebird_lgz201_properties; static struct dvb_usb_device_properties cxusb_bluebird_dtt7579_properties; static struct dvb_usb_device_properties cxusb_bluebird_dualdig4_properties; static struct dvb_usb_device_properties cxusb_bluebird_dualdig4_rev2_properties; static struct dvb_usb_device_properties cxusb_bluebird_nano2_properties; static struct dvb_usb_device_properties cxusb_bluebird_nano2_needsfirmware_properties; static struct dvb_usb_device_properties cxusb_aver_a868r_properties; static struct dvb_usb_device_properties cxusb_d680_dmb_properties; static struct dvb_usb_device_properties cxusb_mygica_d689_properties; static int cxusb_medion_priv_init(struct dvb_usb_device *dvbdev) { struct cxusb_medion_dev *cxdev = dvbdev->priv; cxdev->dvbdev = dvbdev; cxdev->open_type = CXUSB_OPEN_INIT; mutex_init(&cxdev->open_lock); return 0; } static void cxusb_medion_priv_destroy(struct dvb_usb_device *dvbdev) { struct cxusb_medion_dev *cxdev = dvbdev->priv; mutex_destroy(&cxdev->open_lock); } static bool cxusb_medion_check_altsetting(struct usb_host_interface *as) { unsigned int ctr; for (ctr = 0; ctr < as->desc.bNumEndpoints; ctr++) { if ((as->endpoint[ctr].desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK) != 2) continue; if (as->endpoint[ctr].desc.bEndpointAddress & USB_DIR_IN && ((as->endpoint[ctr].desc.bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_ISOC)) return true; break; } return false; } static bool cxusb_medion_check_intf(struct usb_interface *intf) { unsigned int ctr; if (intf->num_altsetting < 2) { dev_err(intf->usb_dev, "no alternate interface"); return false; } for (ctr = 0; ctr < intf->num_altsetting; ctr++) { if (intf->altsetting[ctr].desc.bAlternateSetting != 1) continue; if (cxusb_medion_check_altsetting(&intf->altsetting[ctr])) return true; break; } dev_err(intf->usb_dev, "no iso interface"); return false; } static int cxusb_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct dvb_usb_device *dvbdev; int ret; /* Medion 95700 */ if (!dvb_usb_device_init(intf, &cxusb_medion_properties, THIS_MODULE, &dvbdev, adapter_nr)) { if (!cxusb_medion_check_intf(intf)) { ret = -ENODEV; goto ret_uninit; } _cxusb_power_ctrl(dvbdev, 1); ret = cxusb_medion_set_mode(dvbdev, false); if (ret) goto ret_uninit; ret = cxusb_medion_register_analog(dvbdev); cxusb_medion_set_mode(dvbdev, true); _cxusb_power_ctrl(dvbdev, 0); if (ret != 0) goto ret_uninit; /* release device from INIT mode to normal operation */ cxusb_medion_put(dvbdev); return 0; } else if (!dvb_usb_device_init(intf, &cxusb_bluebird_lgh064f_properties, THIS_MODULE, NULL, adapter_nr) || !dvb_usb_device_init(intf, &cxusb_bluebird_dee1601_properties, THIS_MODULE, NULL, adapter_nr) || !dvb_usb_device_init(intf, &cxusb_bluebird_lgz201_properties, THIS_MODULE, NULL, adapter_nr) || !dvb_usb_device_init(intf, &cxusb_bluebird_dtt7579_properties, THIS_MODULE, NULL, adapter_nr) || !dvb_usb_device_init(intf, &cxusb_bluebird_dualdig4_properties, THIS_MODULE, NULL, adapter_nr) || !dvb_usb_device_init(intf, &cxusb_bluebird_nano2_properties, THIS_MODULE, NULL, adapter_nr) || !dvb_usb_device_init(intf, &cxusb_bluebird_nano2_needsfirmware_properties, THIS_MODULE, NULL, adapter_nr) || !dvb_usb_device_init(intf, &cxusb_aver_a868r_properties, THIS_MODULE, NULL, adapter_nr) || !dvb_usb_device_init(intf, &cxusb_bluebird_dualdig4_rev2_properties, THIS_MODULE, NULL, adapter_nr) || !dvb_usb_device_init(intf, &cxusb_d680_dmb_properties, THIS_MODULE, NULL, adapter_nr) || !dvb_usb_device_init(intf, &cxusb_mygica_d689_properties, THIS_MODULE, NULL, adapter_nr) || 0) return 0; return -EINVAL; ret_uninit: dvb_usb_device_exit(intf); return ret; } static void cxusb_disconnect(struct usb_interface *intf) { struct dvb_usb_device *d = usb_get_intfdata(intf); struct cxusb_state *st = d->priv; struct i2c_client *client; if (d->props.devices[0].warm_ids[0] == &cxusb_table[MEDION_MD95700]) cxusb_medion_unregister_analog(d); /* remove I2C client for tuner */ client = st->i2c_client_tuner; if (client) { module_put(client->dev.driver->owner); i2c_unregister_device(client); } /* remove I2C client for demodulator */ client = st->i2c_client_demod; if (client) { module_put(client->dev.driver->owner); i2c_unregister_device(client); } dvb_usb_device_exit(intf); } static struct usb_device_id cxusb_table[] = { DVB_USB_DEV(MEDION, MEDION_MD95700), DVB_USB_DEV(DVICO, DVICO_BLUEBIRD_LG064F_COLD), DVB_USB_DEV(DVICO, DVICO_BLUEBIRD_LG064F_WARM), DVB_USB_DEV(DVICO, DVICO_BLUEBIRD_DUAL_1_COLD), DVB_USB_DEV(DVICO, DVICO_BLUEBIRD_DUAL_1_WARM), DVB_USB_DEV(DVICO, DVICO_BLUEBIRD_LGZ201_COLD), DVB_USB_DEV(DVICO, DVICO_BLUEBIRD_LGZ201_WARM), DVB_USB_DEV(DVICO, DVICO_BLUEBIRD_TH7579_COLD), DVB_USB_DEV(DVICO, DVICO_BLUEBIRD_TH7579_WARM), DVB_USB_DEV(DVICO, DIGITALNOW_BLUEBIRD_DUAL_1_COLD), DVB_USB_DEV(DVICO, DIGITALNOW_BLUEBIRD_DUAL_1_WARM), DVB_USB_DEV(DVICO, DVICO_BLUEBIRD_DUAL_2_COLD), DVB_USB_DEV(DVICO, DVICO_BLUEBIRD_DUAL_2_WARM), DVB_USB_DEV(DVICO, DVICO_BLUEBIRD_DUAL_4), DVB_USB_DEV(DVICO, DVICO_BLUEBIRD_DVB_T_NANO_2), DVB_USB_DEV(DVICO, DVICO_BLUEBIRD_DVB_T_NANO_2_NFW_WARM), DVB_USB_DEV(AVERMEDIA, AVERMEDIA_VOLAR_A868R), DVB_USB_DEV(DVICO, DVICO_BLUEBIRD_DUAL_4_REV_2), DVB_USB_DEV(CONEXANT, CONEXANT_D680_DMB), DVB_USB_DEV(CONEXANT, MYGICA_D689), { } }; MODULE_DEVICE_TABLE(usb, cxusb_table); static struct dvb_usb_device_properties cxusb_medion_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = CYPRESS_FX2, .size_of_priv = sizeof(struct cxusb_medion_dev), .priv_init = cxusb_medion_priv_init, .priv_destroy = cxusb_medion_priv_destroy, .num_adapters = 1, .adapter = { { .num_frontends = 1, .fe = {{ .streaming_ctrl = cxusb_streaming_ctrl, .frontend_attach = cxusb_cx22702_frontend_attach, .tuner_attach = cxusb_fmd1216me_tuner_attach, /* parameter for the MPEG2-data transfer */ .stream = { .type = USB_BULK, .count = 5, .endpoint = 0x02, .u = { .bulk = { .buffersize = 8192, } } }, } }, }, }, .power_ctrl = cxusb_power_ctrl, .i2c_algo = &cxusb_i2c_algo, .generic_bulk_ctrl_endpoint = 0x01, .num_device_descs = 1, .devices = { { "Medion MD95700 (MDUSBTV-HYBRID)", { NULL }, { &cxusb_table[MEDION_MD95700], NULL }, }, } }; static struct dvb_usb_device_properties cxusb_bluebird_lgh064f_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .firmware = "dvb-usb-bluebird-01.fw", .download_firmware = bluebird_patch_dvico_firmware_download, /* * use usb alt setting 0 for EP4 transfer (dvb-t), * use usb alt setting 7 for EP2 transfer (atsc) */ .size_of_priv = sizeof(struct cxusb_state), .num_adapters = 1, .adapter = { { .num_frontends = 1, .fe = {{ .streaming_ctrl = cxusb_streaming_ctrl, .frontend_attach = cxusb_lgdt3303_frontend_attach, .tuner_attach = cxusb_lgh064f_tuner_attach, /* parameter for the MPEG2-data transfer */ .stream = { .type = USB_BULK, .count = 5, .endpoint = 0x02, .u = { .bulk = { .buffersize = 8192, } } }, } }, }, }, .power_ctrl = cxusb_bluebird_power_ctrl, .i2c_algo = &cxusb_i2c_algo, .rc.core = { .rc_interval = 100, .rc_codes = RC_MAP_DVICO_PORTABLE, .module_name = KBUILD_MODNAME, .rc_query = cxusb_rc_query, .allowed_protos = RC_PROTO_BIT_NEC, }, .generic_bulk_ctrl_endpoint = 0x01, .num_device_descs = 1, .devices = { { "DViCO FusionHDTV5 USB Gold", { &cxusb_table[DVICO_BLUEBIRD_LG064F_COLD], NULL }, { &cxusb_table[DVICO_BLUEBIRD_LG064F_WARM], NULL }, }, } }; static struct dvb_usb_device_properties cxusb_bluebird_dee1601_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .firmware = "dvb-usb-bluebird-01.fw", .download_firmware = bluebird_patch_dvico_firmware_download, /* * use usb alt setting 0 for EP4 transfer (dvb-t), * use usb alt setting 7 for EP2 transfer (atsc) */ .size_of_priv = sizeof(struct cxusb_state), .num_adapters = 1, .adapter = { { .num_frontends = 1, .fe = {{ .streaming_ctrl = cxusb_streaming_ctrl, .frontend_attach = cxusb_dee1601_frontend_attach, .tuner_attach = cxusb_dee1601_tuner_attach, /* parameter for the MPEG2-data transfer */ .stream = { .type = USB_BULK, .count = 5, .endpoint = 0x04, .u = { .bulk = { .buffersize = 8192, } } }, } }, }, }, .power_ctrl = cxusb_bluebird_power_ctrl, .i2c_algo = &cxusb_i2c_algo, .rc.core = { .rc_interval = 100, .rc_codes = RC_MAP_DVICO_MCE, .module_name = KBUILD_MODNAME, .rc_query = cxusb_rc_query, .allowed_protos = RC_PROTO_BIT_NEC, }, .generic_bulk_ctrl_endpoint = 0x01, .num_device_descs = 3, .devices = { { "DViCO FusionHDTV DVB-T Dual USB", { &cxusb_table[DVICO_BLUEBIRD_DUAL_1_COLD], NULL }, { &cxusb_table[DVICO_BLUEBIRD_DUAL_1_WARM], NULL }, }, { "DigitalNow DVB-T Dual USB", { &cxusb_table[DIGITALNOW_BLUEBIRD_DUAL_1_COLD], NULL }, { &cxusb_table[DIGITALNOW_BLUEBIRD_DUAL_1_WARM], NULL }, }, { "DViCO FusionHDTV DVB-T Dual Digital 2", { &cxusb_table[DVICO_BLUEBIRD_DUAL_2_COLD], NULL }, { &cxusb_table[DVICO_BLUEBIRD_DUAL_2_WARM], NULL }, }, } }; static struct dvb_usb_device_properties cxusb_bluebird_lgz201_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .firmware = "dvb-usb-bluebird-01.fw", .download_firmware = bluebird_patch_dvico_firmware_download, /* * use usb alt setting 0 for EP4 transfer (dvb-t), * use usb alt setting 7 for EP2 transfer (atsc) */ .size_of_priv = sizeof(struct cxusb_state), .num_adapters = 1, .adapter = { { .num_frontends = 1, .fe = {{ .streaming_ctrl = cxusb_streaming_ctrl, .frontend_attach = cxusb_mt352_frontend_attach, .tuner_attach = cxusb_lgz201_tuner_attach, /* parameter for the MPEG2-data transfer */ .stream = { .type = USB_BULK, .count = 5, .endpoint = 0x04, .u = { .bulk = { .buffersize = 8192, } } }, } }, }, }, .power_ctrl = cxusb_bluebird_power_ctrl, .i2c_algo = &cxusb_i2c_algo, .rc.core = { .rc_interval = 100, .rc_codes = RC_MAP_DVICO_PORTABLE, .module_name = KBUILD_MODNAME, .rc_query = cxusb_rc_query, .allowed_protos = RC_PROTO_BIT_NEC, }, .generic_bulk_ctrl_endpoint = 0x01, .num_device_descs = 1, .devices = { { "DViCO FusionHDTV DVB-T USB (LGZ201)", { &cxusb_table[DVICO_BLUEBIRD_LGZ201_COLD], NULL }, { &cxusb_table[DVICO_BLUEBIRD_LGZ201_WARM], NULL }, }, } }; static struct dvb_usb_device_properties cxusb_bluebird_dtt7579_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .firmware = "dvb-usb-bluebird-01.fw", .download_firmware = bluebird_patch_dvico_firmware_download, /* * use usb alt setting 0 for EP4 transfer (dvb-t), * use usb alt setting 7 for EP2 transfer (atsc) */ .size_of_priv = sizeof(struct cxusb_state), .num_adapters = 1, .adapter = { { .num_frontends = 1, .fe = {{ .streaming_ctrl = cxusb_streaming_ctrl, .frontend_attach = cxusb_mt352_frontend_attach, .tuner_attach = cxusb_dtt7579_tuner_attach, /* parameter for the MPEG2-data transfer */ .stream = { .type = USB_BULK, .count = 5, .endpoint = 0x04, .u = { .bulk = { .buffersize = 8192, } } }, } }, }, }, .power_ctrl = cxusb_bluebird_power_ctrl, .i2c_algo = &cxusb_i2c_algo, .rc.core = { .rc_interval = 100, .rc_codes = RC_MAP_DVICO_PORTABLE, .module_name = KBUILD_MODNAME, .rc_query = cxusb_rc_query, .allowed_protos = RC_PROTO_BIT_NEC, }, .generic_bulk_ctrl_endpoint = 0x01, .num_device_descs = 1, .devices = { { "DViCO FusionHDTV DVB-T USB (TH7579)", { &cxusb_table[DVICO_BLUEBIRD_TH7579_COLD], NULL }, { &cxusb_table[DVICO_BLUEBIRD_TH7579_WARM], NULL }, }, } }; static struct dvb_usb_device_properties cxusb_bluebird_dualdig4_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = CYPRESS_FX2, .size_of_priv = sizeof(struct cxusb_state), .num_adapters = 1, .adapter = { { .num_frontends = 1, .fe = {{ .streaming_ctrl = cxusb_streaming_ctrl, .frontend_attach = cxusb_dualdig4_frontend_attach, .tuner_attach = cxusb_dvico_xc3028_tuner_attach, /* parameter for the MPEG2-data transfer */ .stream = { .type = USB_BULK, .count = 5, .endpoint = 0x02, .u = { .bulk = { .buffersize = 8192, } } }, } }, }, }, .power_ctrl = cxusb_power_ctrl, .i2c_algo = &cxusb_i2c_algo, .generic_bulk_ctrl_endpoint = 0x01, .rc.core = { .rc_interval = 100, .rc_codes = RC_MAP_DVICO_MCE, .module_name = KBUILD_MODNAME, .rc_query = cxusb_bluebird2_rc_query, .allowed_protos = RC_PROTO_BIT_NEC, }, .num_device_descs = 1, .devices = { { "DViCO FusionHDTV DVB-T Dual Digital 4", { NULL }, { &cxusb_table[DVICO_BLUEBIRD_DUAL_4], NULL }, }, } }; static struct dvb_usb_device_properties cxusb_bluebird_nano2_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = CYPRESS_FX2, .identify_state = bluebird_fx2_identify_state, .size_of_priv = sizeof(struct cxusb_state), .num_adapters = 1, .adapter = { { .num_frontends = 1, .fe = {{ .streaming_ctrl = cxusb_streaming_ctrl, .frontend_attach = cxusb_nano2_frontend_attach, .tuner_attach = cxusb_dvico_xc3028_tuner_attach, /* parameter for the MPEG2-data transfer */ .stream = { .type = USB_BULK, .count = 5, .endpoint = 0x02, .u = { .bulk = { .buffersize = 8192, } } }, } }, }, }, .power_ctrl = cxusb_nano2_power_ctrl, .i2c_algo = &cxusb_i2c_algo, .generic_bulk_ctrl_endpoint = 0x01, .rc.core = { .rc_interval = 100, .rc_codes = RC_MAP_DVICO_PORTABLE, .module_name = KBUILD_MODNAME, .rc_query = cxusb_bluebird2_rc_query, .allowed_protos = RC_PROTO_BIT_NEC, }, .num_device_descs = 1, .devices = { { "DViCO FusionHDTV DVB-T NANO2", { NULL }, { &cxusb_table[DVICO_BLUEBIRD_DVB_T_NANO_2], NULL }, }, } }; static struct dvb_usb_device_properties cxusb_bluebird_nano2_needsfirmware_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .firmware = "dvb-usb-bluebird-02.fw", .download_firmware = bluebird_patch_dvico_firmware_download, .identify_state = bluebird_fx2_identify_state, .size_of_priv = sizeof(struct cxusb_state), .num_adapters = 1, .adapter = { { .num_frontends = 1, .fe = {{ .streaming_ctrl = cxusb_streaming_ctrl, .frontend_attach = cxusb_nano2_frontend_attach, .tuner_attach = cxusb_dvico_xc3028_tuner_attach, /* parameter for the MPEG2-data transfer */ .stream = { .type = USB_BULK, .count = 5, .endpoint = 0x02, .u = { .bulk = { .buffersize = 8192, } } }, } }, }, }, .power_ctrl = cxusb_nano2_power_ctrl, .i2c_algo = &cxusb_i2c_algo, .generic_bulk_ctrl_endpoint = 0x01, .rc.core = { .rc_interval = 100, .rc_codes = RC_MAP_DVICO_PORTABLE, .module_name = KBUILD_MODNAME, .rc_query = cxusb_rc_query, .allowed_protos = RC_PROTO_BIT_NEC, }, .num_device_descs = 1, .devices = { { "DViCO FusionHDTV DVB-T NANO2 w/o firmware", { &cxusb_table[DVICO_BLUEBIRD_DVB_T_NANO_2], NULL }, { &cxusb_table[DVICO_BLUEBIRD_DVB_T_NANO_2_NFW_WARM], NULL }, }, } }; static struct dvb_usb_device_properties cxusb_aver_a868r_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = CYPRESS_FX2, .size_of_priv = sizeof(struct cxusb_state), .num_adapters = 1, .adapter = { { .num_frontends = 1, .fe = {{ .streaming_ctrl = cxusb_aver_streaming_ctrl, .frontend_attach = cxusb_aver_lgdt3303_frontend_attach, .tuner_attach = cxusb_mxl5003s_tuner_attach, /* parameter for the MPEG2-data transfer */ .stream = { .type = USB_BULK, .count = 5, .endpoint = 0x04, .u = { .bulk = { .buffersize = 8192, } } }, } }, }, }, .power_ctrl = cxusb_aver_power_ctrl, .i2c_algo = &cxusb_i2c_algo, .generic_bulk_ctrl_endpoint = 0x01, .num_device_descs = 1, .devices = { { "AVerMedia AVerTVHD Volar (A868R)", { NULL }, { &cxusb_table[AVERMEDIA_VOLAR_A868R], NULL }, }, } }; static struct dvb_usb_device_properties cxusb_bluebird_dualdig4_rev2_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = CYPRESS_FX2, .size_of_priv = sizeof(struct cxusb_state), .num_adapters = 1, .adapter = { { .size_of_priv = sizeof(struct dib0700_adapter_state), .num_frontends = 1, .fe = {{ .streaming_ctrl = cxusb_streaming_ctrl, .frontend_attach = cxusb_dualdig4_rev2_frontend_attach, .tuner_attach = cxusb_dualdig4_rev2_tuner_attach, /* parameter for the MPEG2-data transfer */ .stream = { .type = USB_BULK, .count = 7, .endpoint = 0x02, .u = { .bulk = { .buffersize = 4096, } } }, } }, }, }, .power_ctrl = cxusb_bluebird_power_ctrl, .i2c_algo = &cxusb_i2c_algo, .generic_bulk_ctrl_endpoint = 0x01, .rc.core = { .rc_interval = 100, .rc_codes = RC_MAP_DVICO_MCE, .module_name = KBUILD_MODNAME, .rc_query = cxusb_rc_query, .allowed_protos = RC_PROTO_BIT_NEC, }, .num_device_descs = 1, .devices = { { "DViCO FusionHDTV DVB-T Dual Digital 4 (rev 2)", { NULL }, { &cxusb_table[DVICO_BLUEBIRD_DUAL_4_REV_2], NULL }, }, } }; static struct dvb_usb_device_properties cxusb_d680_dmb_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = CYPRESS_FX2, .size_of_priv = sizeof(struct cxusb_state), .num_adapters = 1, .adapter = { { .num_frontends = 1, .fe = {{ .streaming_ctrl = cxusb_d680_dmb_streaming_ctrl, .frontend_attach = cxusb_d680_dmb_frontend_attach, .tuner_attach = cxusb_d680_dmb_tuner_attach, /* parameter for the MPEG2-data transfer */ .stream = { .type = USB_BULK, .count = 5, .endpoint = 0x02, .u = { .bulk = { .buffersize = 8192, } } }, } }, }, }, .power_ctrl = cxusb_d680_dmb_power_ctrl, .i2c_algo = &cxusb_i2c_algo, .generic_bulk_ctrl_endpoint = 0x01, .rc.core = { .rc_interval = 100, .rc_codes = RC_MAP_TOTAL_MEDIA_IN_HAND_02, .module_name = KBUILD_MODNAME, .rc_query = cxusb_d680_dmb_rc_query, .allowed_protos = RC_PROTO_BIT_UNKNOWN, }, .num_device_descs = 1, .devices = { { "Conexant DMB-TH Stick", { NULL }, { &cxusb_table[CONEXANT_D680_DMB], NULL }, }, } }; static struct dvb_usb_device_properties cxusb_mygica_d689_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = CYPRESS_FX2, .size_of_priv = sizeof(struct cxusb_state), .num_adapters = 1, .adapter = { { .num_frontends = 1, .fe = {{ .streaming_ctrl = cxusb_d680_dmb_streaming_ctrl, .frontend_attach = cxusb_mygica_d689_frontend_attach, .tuner_attach = cxusb_mygica_d689_tuner_attach, /* parameter for the MPEG2-data transfer */ .stream = { .type = USB_BULK, .count = 5, .endpoint = 0x02, .u = { .bulk = { .buffersize = 8192, } } }, } }, }, }, .power_ctrl = cxusb_d680_dmb_power_ctrl, .i2c_algo = &cxusb_i2c_algo, .generic_bulk_ctrl_endpoint = 0x01, .rc.core = { .rc_interval = 100, .rc_codes = RC_MAP_D680_DMB, .module_name = KBUILD_MODNAME, .rc_query = cxusb_d680_dmb_rc_query, .allowed_protos = RC_PROTO_BIT_UNKNOWN, }, .num_device_descs = 1, .devices = { { "Mygica D689 DMB-TH", { NULL }, { &cxusb_table[MYGICA_D689], NULL }, }, } }; static struct usb_driver cxusb_driver = { .name = "dvb_usb_cxusb", .probe = cxusb_probe, .disconnect = cxusb_disconnect, .id_table = cxusb_table, }; module_usb_driver(cxusb_driver); MODULE_AUTHOR("Patrick Boettcher <patrick.boettcher@posteo.de>"); MODULE_AUTHOR("Michael Krufky <mkrufky@linuxtv.org>"); MODULE_AUTHOR("Chris Pascoe <c.pascoe@itee.uq.edu.au>"); MODULE_AUTHOR("Maciej S. Szmigiero <mail@maciej.szmigiero.name>"); MODULE_DESCRIPTION("Driver for Conexant USB2.0 hybrid reference design"); MODULE_LICENSE("GPL"); |
4 16669 2268 53 31 53 86 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NODEMASK_H #define __LINUX_NODEMASK_H /* * Nodemasks provide a bitmap suitable for representing the * set of Node's in a system, one bit position per Node number. * * See detailed comments in the file linux/bitmap.h describing the * data type on which these nodemasks are based. * * For details of nodemask_parse_user(), see bitmap_parse_user() in * lib/bitmap.c. For details of nodelist_parse(), see bitmap_parselist(), * also in bitmap.c. For details of node_remap(), see bitmap_bitremap in * lib/bitmap.c. For details of nodes_remap(), see bitmap_remap in * lib/bitmap.c. For details of nodes_onto(), see bitmap_onto in * lib/bitmap.c. For details of nodes_fold(), see bitmap_fold in * lib/bitmap.c. * * The available nodemask operations are: * * void node_set(node, mask) turn on bit 'node' in mask * void node_clear(node, mask) turn off bit 'node' in mask * void nodes_setall(mask) set all bits * void nodes_clear(mask) clear all bits * int node_isset(node, mask) true iff bit 'node' set in mask * int node_test_and_set(node, mask) test and set bit 'node' in mask * * void nodes_and(dst, src1, src2) dst = src1 & src2 [intersection] * void nodes_or(dst, src1, src2) dst = src1 | src2 [union] * void nodes_xor(dst, src1, src2) dst = src1 ^ src2 * void nodes_andnot(dst, src1, src2) dst = src1 & ~src2 * void nodes_complement(dst, src) dst = ~src * * int nodes_equal(mask1, mask2) Does mask1 == mask2? * int nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect? * int nodes_subset(mask1, mask2) Is mask1 a subset of mask2? * int nodes_empty(mask) Is mask empty (no bits sets)? * int nodes_full(mask) Is mask full (all bits sets)? * int nodes_weight(mask) Hamming weight - number of set bits * * void nodes_shift_right(dst, src, n) Shift right * void nodes_shift_left(dst, src, n) Shift left * * unsigned int first_node(mask) Number lowest set bit, or MAX_NUMNODES * unsigend int next_node(node, mask) Next node past 'node', or MAX_NUMNODES * unsigned int next_node_in(node, mask) Next node past 'node', or wrap to first, * or MAX_NUMNODES * unsigned int first_unset_node(mask) First node not set in mask, or * MAX_NUMNODES * * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set * NODE_MASK_ALL Initializer - all bits set * NODE_MASK_NONE Initializer - no bits set * unsigned long *nodes_addr(mask) Array of unsigned long's in mask * * int nodemask_parse_user(ubuf, ulen, mask) Parse ascii string as nodemask * int nodelist_parse(buf, map) Parse ascii string as nodelist * int node_remap(oldbit, old, new) newbit = map(old, new)(oldbit) * void nodes_remap(dst, src, old, new) *dst = map(old, new)(src) * void nodes_onto(dst, orig, relmap) *dst = orig relative to relmap * void nodes_fold(dst, orig, sz) dst bits = orig bits mod sz * * for_each_node_mask(node, mask) for-loop node over mask * * int num_online_nodes() Number of online Nodes * int num_possible_nodes() Number of all possible Nodes * * int node_random(mask) Random node with set bit in mask * * int node_online(node) Is some node online? * int node_possible(node) Is some node possible? * * node_set_online(node) set bit 'node' in node_online_map * node_set_offline(node) clear bit 'node' in node_online_map * * for_each_node(node) for-loop node over node_possible_map * for_each_online_node(node) for-loop node over node_online_map * * Subtlety: * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway) * to generate slightly worse code. So use a simple one-line #define * for node_isset(), instead of wrapping an inline inside a macro, the * way we do the other calls. * * NODEMASK_SCRATCH * When doing above logical AND, OR, XOR, Remap operations the callers tend to * need temporary nodemask_t's on the stack. But if NODES_SHIFT is large, * nodemask_t's consume too much stack space. NODEMASK_SCRATCH is a helper * for such situations. See below and CPUMASK_ALLOC also. */ #include <linux/threads.h> #include <linux/bitmap.h> #include <linux/minmax.h> #include <linux/nodemask_types.h> #include <linux/numa.h> #include <linux/random.h> extern nodemask_t _unused_nodemask_arg_; /** * nodemask_pr_args - printf args to output a nodemask * @maskp: nodemask to be printed * * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. */ #define nodemask_pr_args(maskp) __nodemask_pr_numnodes(maskp), \ __nodemask_pr_bits(maskp) static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) { return m ? MAX_NUMNODES : 0; } static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) { return m ? m->bits : NULL; } /* * The inline keyword gives the compiler room to decide to inline, or * not inline a function as it sees best. However, as these functions * are called in both __init and non-__init functions, if they are not * inlined we will end up with a section mismatch error (of the type of * freeable items not being freed). So we must use __always_inline here * to fix the problem. If other functions in the future also end up in * this situation they will also need to be annotated as __always_inline */ #define node_set(node, dst) __node_set((node), &(dst)) static __always_inline void __node_set(int node, volatile nodemask_t *dstp) { set_bit(node, dstp->bits); } #define node_clear(node, dst) __node_clear((node), &(dst)) static inline void __node_clear(int node, volatile nodemask_t *dstp) { clear_bit(node, dstp->bits); } #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) { bitmap_fill(dstp->bits, nbits); } #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) { bitmap_zero(dstp->bits, nbits); } /* No static inline type checking - see Subtlety (1) above. */ #define node_isset(node, nodemask) test_bit((node), (nodemask).bits) #define node_test_and_set(node, nodemask) \ __node_test_and_set((node), &(nodemask)) static inline bool __node_test_and_set(int node, nodemask_t *addr) { return test_and_set_bit(node, addr->bits); } #define nodes_and(dst, src1, src2) \ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_or(dst, src1, src2) \ __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_xor(dst, src1, src2) \ __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_andnot(dst, src1, src2) \ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) static inline void __nodes_complement(nodemask_t *dstp, const nodemask_t *srcp, unsigned int nbits) { bitmap_complement(dstp->bits, srcp->bits, nbits); } #define nodes_equal(src1, src2) \ __nodes_equal(&(src1), &(src2), MAX_NUMNODES) static inline bool __nodes_equal(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); } #define nodes_intersects(src1, src2) \ __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) static inline bool __nodes_intersects(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); } #define nodes_subset(src1, src2) \ __nodes_subset(&(src1), &(src2), MAX_NUMNODES) static inline bool __nodes_subset(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) static inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) { return bitmap_empty(srcp->bits, nbits); } #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) static inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) { return bitmap_full(srcp->bits, nbits); } #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) { return bitmap_weight(srcp->bits, nbits); } #define nodes_shift_right(dst, src, n) \ __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES) static inline void __nodes_shift_right(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); } #define nodes_shift_left(dst, src, n) \ __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES) static inline void __nodes_shift_left(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); } /* FIXME: better would be to fix all architectures to never return > MAX_NUMNODES, then the silly min_ts could be dropped. */ #define first_node(src) __first_node(&(src)) static inline unsigned int __first_node(const nodemask_t *srcp) { return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); } #define next_node(n, src) __next_node((n), &(src)) static inline unsigned int __next_node(int n, const nodemask_t *srcp) { return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } /* * Find the next present node in src, starting after node n, wrapping around to * the first node in src if needed. Returns MAX_NUMNODES if src is empty. */ #define next_node_in(n, src) __next_node_in((n), &(src)) static inline unsigned int __next_node_in(int node, const nodemask_t *srcp) { unsigned int ret = __next_node(node, srcp); if (ret == MAX_NUMNODES) ret = __first_node(srcp); return ret; } static inline void init_nodemask_of_node(nodemask_t *mask, int node) { nodes_clear(*mask); node_set(node, *mask); } #define nodemask_of_node(node) \ ({ \ typeof(_unused_nodemask_arg_) m; \ if (sizeof(m) == sizeof(unsigned long)) { \ m.bits[0] = 1UL << (node); \ } else { \ init_nodemask_of_node(&m, (node)); \ } \ m; \ }) #define first_unset_node(mask) __first_unset_node(&(mask)) static inline unsigned int __first_unset_node(const nodemask_t *maskp) { return min_t(unsigned int, MAX_NUMNODES, find_first_zero_bit(maskp->bits, MAX_NUMNODES)); } #define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES) #if MAX_NUMNODES <= BITS_PER_LONG #define NODE_MASK_ALL \ ((nodemask_t) { { \ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ } }) #else #define NODE_MASK_ALL \ ((nodemask_t) { { \ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL, \ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ } }) #endif #define NODE_MASK_NONE \ ((nodemask_t) { { \ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] = 0UL \ } }) #define nodes_addr(src) ((src).bits) #define nodemask_parse_user(ubuf, ulen, dst) \ __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES) static inline int __nodemask_parse_user(const char __user *buf, int len, nodemask_t *dstp, int nbits) { return bitmap_parse_user(buf, len, dstp->bits, nbits); } #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES) static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) { return bitmap_parselist(buf, dstp->bits, nbits); } #define node_remap(oldbit, old, new) \ __node_remap((oldbit), &(old), &(new), MAX_NUMNODES) static inline int __node_remap(int oldbit, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits); } #define nodes_remap(dst, src, old, new) \ __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES) static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits); } #define nodes_onto(dst, orig, relmap) \ __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES) static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, const nodemask_t *relmapp, int nbits) { bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits); } #define nodes_fold(dst, orig, sz) \ __nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES) static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, int sz, int nbits) { bitmap_fold(dstp->bits, origp->bits, sz, nbits); } #if MAX_NUMNODES > 1 #define for_each_node_mask(node, mask) \ for ((node) = first_node(mask); \ (node) < MAX_NUMNODES; \ (node) = next_node((node), (mask))) #else /* MAX_NUMNODES == 1 */ #define for_each_node_mask(node, mask) \ for ((node) = 0; (node) < 1 && !nodes_empty(mask); (node)++) #endif /* MAX_NUMNODES */ /* * Bitmasks that are kept for all the nodes. */ enum node_states { N_POSSIBLE, /* The node could become online at some point */ N_ONLINE, /* The node is online */ N_NORMAL_MEMORY, /* The node has regular memory */ #ifdef CONFIG_HIGHMEM N_HIGH_MEMORY, /* The node has regular or high memory */ #else N_HIGH_MEMORY = N_NORMAL_MEMORY, #endif N_MEMORY, /* The node has memory(regular, high, movable) */ N_CPU, /* The node has one or more cpus */ N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ NR_NODE_STATES }; /* * The following particular system nodemasks and operations * on them manage all possible and online nodes. */ extern nodemask_t node_states[NR_NODE_STATES]; #if MAX_NUMNODES > 1 static inline int node_state(int node, enum node_states state) { return node_isset(node, node_states[state]); } static inline void node_set_state(int node, enum node_states state) { __node_set(node, &node_states[state]); } static inline void node_clear_state(int node, enum node_states state) { __node_clear(node, &node_states[state]); } static inline int num_node_state(enum node_states state) { return nodes_weight(node_states[state]); } #define for_each_node_state(__node, __state) \ for_each_node_mask((__node), node_states[__state]) #define first_online_node first_node(node_states[N_ONLINE]) #define first_memory_node first_node(node_states[N_MEMORY]) static inline unsigned int next_online_node(int nid) { return next_node(nid, node_states[N_ONLINE]); } static inline unsigned int next_memory_node(int nid) { return next_node(nid, node_states[N_MEMORY]); } extern unsigned int nr_node_ids; extern unsigned int nr_online_nodes; static inline void node_set_online(int nid) { node_set_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } static inline void node_set_offline(int nid) { node_clear_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } #else static inline int node_state(int node, enum node_states state) { return node == 0; } static inline void node_set_state(int node, enum node_states state) { } static inline void node_clear_state(int node, enum node_states state) { } static inline int num_node_state(enum node_states state) { return 1; } #define for_each_node_state(node, __state) \ for ( (node) = 0; (node) == 0; (node) = 1) #define first_online_node 0 #define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) #define next_memory_node(nid) (MAX_NUMNODES) #define nr_node_ids 1U #define nr_online_nodes 1U #define node_set_online(node) node_set_state((node), N_ONLINE) #define node_set_offline(node) node_clear_state((node), N_ONLINE) #endif static inline int node_random(const nodemask_t *maskp) { #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) int w, bit; w = nodes_weight(*maskp); switch (w) { case 0: bit = NUMA_NO_NODE; break; case 1: bit = first_node(*maskp); break; default: bit = find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_u32_below(w)); break; } return bit; #else return 0; #endif } #define node_online_map node_states[N_ONLINE] #define node_possible_map node_states[N_POSSIBLE] #define num_online_nodes() num_node_state(N_ONLINE) #define num_possible_nodes() num_node_state(N_POSSIBLE) #define node_online(node) node_state((node), N_ONLINE) #define node_possible(node) node_state((node), N_POSSIBLE) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) /* * For nodemask scratch area. * NODEMASK_ALLOC(type, name) allocates an object with a specified type and * name. */ #if NODES_SHIFT > 8 /* nodemask_t > 32 bytes */ #define NODEMASK_ALLOC(type, name, gfp_flags) \ type *name = kmalloc(sizeof(*name), gfp_flags) #define NODEMASK_FREE(m) kfree(m) #else #define NODEMASK_ALLOC(type, name, gfp_flags) type _##name, *name = &_##name #define NODEMASK_FREE(m) do {} while (0) #endif /* Example structure for using NODEMASK_ALLOC, used in mempolicy. */ struct nodemask_scratch { nodemask_t mask1; nodemask_t mask2; }; #define NODEMASK_SCRATCH(x) \ NODEMASK_ALLOC(struct nodemask_scratch, x, \ GFP_KERNEL | __GFP_NORETRY) #define NODEMASK_SCRATCH_FREE(x) NODEMASK_FREE(x) #endif /* __LINUX_NODEMASK_H */ |
40253 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM x86_fpu #if !defined(_TRACE_FPU_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FPU_H #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(x86_fpu, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu), TP_STRUCT__entry( __field(struct fpu *, fpu) __field(bool, load_fpu) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; __entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD); if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures; __entry->xcomp_bv = fpu->fpstate->regs.xsave.header.xcomp_bv; } ), TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, __entry->load_fpu, __entry->xfeatures, __entry->xcomp_bv ) ); DEFINE_EVENT(x86_fpu, x86_fpu_before_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_after_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_before_restore, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_after_restore, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_init_state, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_dropped, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_copy_src, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH asm/trace/ #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE fpu #endif /* _TRACE_FPU_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
1054 351 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FIB_LOOKUP_H #define _FIB_LOOKUP_H #include <linux/types.h> #include <linux/list.h> #include <net/inet_dscp.h> #include <net/ip_fib.h> #include <net/nexthop.h> struct fib_alias { struct hlist_node fa_list; struct fib_info *fa_info; dscp_t fa_dscp; u8 fa_type; u8 fa_state; u8 fa_slen; u32 tb_id; s16 fa_default; u8 offload; u8 trap; u8 offload_failed; struct rcu_head rcu; }; #define FA_S_ACCESSED 0x01 /* Don't write on fa_state unless needed, to keep it shared on all cpus */ static inline void fib_alias_accessed(struct fib_alias *fa) { if (!(fa->fa_state & FA_S_ACCESSED)) fa->fa_state |= FA_S_ACCESSED; } /* Exported by fib_semantics.c */ void fib_release_info(struct fib_info *); struct fib_info *fib_create_info(struct fib_config *cfg, struct netlink_ext_ack *extack); int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack); bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi); int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, const struct fib_rt_info *fri, unsigned int flags); void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); size_t fib_nlmsg_size(struct fib_info *fi); static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) { /* we used to play games with refcounts, but we now use RCU */ res->fi = fi; res->nhc = fib_info_nhc(fi, 0); } struct fib_prop { int error; u8 scope; }; extern const struct fib_prop fib_props[RTN_MAX + 1]; #endif /* _FIB_LOOKUP_H */ |
7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 | /* * llc_c_ev.c - Connection component state transition event qualifiers * * A 'state' consists of a number of possible event matching functions, * the actions associated with each being executed when that event is * matched; a 'state machine' accepts events in a serial fashion from an * event queue. Each event is passed to each successive event matching * function until a match is made (the event matching function returns * success, or '0') or the list of event matching functions is exhausted. * If a match is made, the actions associated with the event are executed * and the state is changed to that event's transition state. Before some * events are recognized, even after a match has been made, a certain * number of 'event qualifier' functions must also be executed. If these * all execute successfully, then the event is finally executed. * * These event functions must return 0 for success, to show a matched * event, of 1 if the event does not match. Event qualifier functions * must return a 0 for success or a non-zero for failure. Each function * is simply responsible for verifying one single thing and returning * either a success or failure. * * All of followed event functions are described in 802.2 LLC Protocol * standard document except two functions that we added that will explain * in their comments, at below. * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/netdevice.h> #include <net/llc_conn.h> #include <net/llc_sap.h> #include <net/sock.h> #include <net/llc_c_ac.h> #include <net/llc_c_ev.h> #include <net/llc_pdu.h> #if 1 #define dprintk(args...) printk(KERN_DEBUG args) #else #define dprintk(args...) #endif /** * llc_util_ns_inside_rx_window - check if sequence number is in rx window * @ns: sequence number of received pdu. * @vr: sequence number which receiver expects to receive. * @rw: receive window size of receiver. * * Checks if sequence number of received PDU is in range of receive * window. Returns 0 for success, 1 otherwise */ static u16 llc_util_ns_inside_rx_window(u8 ns, u8 vr, u8 rw) { return !llc_circular_between(vr, ns, (vr + rw - 1) % LLC_2_SEQ_NBR_MODULO); } /** * llc_util_nr_inside_tx_window - check if sequence number is in tx window * @sk: current connection. * @nr: N(R) of received PDU. * * This routine checks if N(R) of received PDU is in range of transmit * window; on the other hand checks if received PDU acknowledges some * outstanding PDUs that are in transmit window. Returns 0 for success, 1 * otherwise. */ static u16 llc_util_nr_inside_tx_window(struct sock *sk, u8 nr) { u8 nr1, nr2; struct sk_buff *skb; struct llc_pdu_sn *pdu; struct llc_sock *llc = llc_sk(sk); int rc = 0; if (llc->dev->flags & IFF_LOOPBACK) goto out; rc = 1; if (skb_queue_empty(&llc->pdu_unack_q)) goto out; skb = skb_peek(&llc->pdu_unack_q); pdu = llc_pdu_sn_hdr(skb); nr1 = LLC_I_GET_NS(pdu); skb = skb_peek_tail(&llc->pdu_unack_q); pdu = llc_pdu_sn_hdr(skb); nr2 = LLC_I_GET_NS(pdu); rc = !llc_circular_between(nr1, nr, (nr2 + 1) % LLC_2_SEQ_NBR_MODULO); out: return rc; } int llc_conn_ev_conn_req(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->prim == LLC_CONN_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; } int llc_conn_ev_data_req(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->prim == LLC_DATA_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; } int llc_conn_ev_disc_req(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->prim == LLC_DISC_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; } int llc_conn_ev_rst_req(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->prim == LLC_RESET_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; } int llc_conn_ev_local_busy_detected(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type == LLC_CONN_EV_TYPE_SIMPLE && ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_DETECTED ? 0 : 1; } int llc_conn_ev_local_busy_cleared(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type == LLC_CONN_EV_TYPE_SIMPLE && ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_CLEARED ? 0 : 1; } int llc_conn_ev_rx_bad_pdu(struct sock *sk, struct sk_buff *skb) { return 1; } int llc_conn_ev_rx_disc_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_DISC ? 0 : 1; } int llc_conn_ev_rx_dm_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_DM ? 0 : 1; } int llc_conn_ev_rx_frmr_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_FRMR ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_0(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_0(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_x_inval_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn * pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); const u16 rc = LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr && llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; if (!rc) dprintk("%s: matched, state=%d, ns=%d, vr=%d\n", __func__, llc_sk(sk)->state, ns, vr); return rc; } int llc_conn_ev_rx_i_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_0(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_0(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_x_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_x_inval_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); const u16 rc = LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr && llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; if (!rc) dprintk("%s: matched, state=%d, ns=%d, vr=%d\n", __func__, llc_sk(sk)->state, ns, vr); return rc; } int llc_conn_ev_rx_rej_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_REJ ? 0 : 1; } int llc_conn_ev_rx_rej_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_REJ ? 0 : 1; } int llc_conn_ev_rx_rej_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1; } int llc_conn_ev_rx_rej_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1; } int llc_conn_ev_rx_rej_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1; } int llc_conn_ev_rx_rnr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RNR ? 0 : 1; } int llc_conn_ev_rx_rnr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RNR ? 0 : 1; } int llc_conn_ev_rx_rnr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RNR ? 0 : 1; } int llc_conn_ev_rx_rnr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RNR ? 0 : 1; } int llc_conn_ev_rx_rr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RR ? 0 : 1; } int llc_conn_ev_rx_rr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RR ? 0 : 1; } int llc_conn_ev_rx_rr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RR ? 0 : 1; } int llc_conn_ev_rx_rr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RR ? 0 : 1; } int llc_conn_ev_rx_sabme_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_SABME ? 0 : 1; } int llc_conn_ev_rx_ua_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_UA ? 0 : 1; } int llc_conn_ev_rx_xxx_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); if (LLC_PDU_IS_CMD(pdu)) { if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) { if (LLC_I_PF_IS_1(pdu)) rc = 0; } else if (LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PF_IS_1(pdu)) rc = 0; } return rc; } int llc_conn_ev_rx_xxx_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); if (LLC_PDU_IS_CMD(pdu)) { if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) rc = 0; else if (LLC_PDU_TYPE_IS_U(pdu)) switch (LLC_U_PDU_CMD(pdu)) { case LLC_2_PDU_CMD_SABME: case LLC_2_PDU_CMD_DISC: rc = 0; break; } } return rc; } int llc_conn_ev_rx_xxx_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); if (LLC_PDU_IS_RSP(pdu)) { if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) rc = 0; else if (LLC_PDU_TYPE_IS_U(pdu)) switch (LLC_U_PDU_RSP(pdu)) { case LLC_2_PDU_RSP_UA: case LLC_2_PDU_RSP_DM: case LLC_2_PDU_RSP_FRMR: rc = 0; break; } } return rc; } int llc_conn_ev_rx_zzz_cmd_pbit_set_x_inval_nr(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vs = llc_sk(sk)->vS; const u8 nr = LLC_I_GET_NR(pdu); if (LLC_PDU_IS_CMD(pdu) && (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) && nr != vs && llc_util_nr_inside_tx_window(sk, nr)) { dprintk("%s: matched, state=%d, vs=%d, nr=%d\n", __func__, llc_sk(sk)->state, vs, nr); rc = 0; } return rc; } int llc_conn_ev_rx_zzz_rsp_fbit_set_x_inval_nr(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vs = llc_sk(sk)->vS; const u8 nr = LLC_I_GET_NR(pdu); if (LLC_PDU_IS_RSP(pdu) && (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) && nr != vs && llc_util_nr_inside_tx_window(sk, nr)) { rc = 0; dprintk("%s: matched, state=%d, vs=%d, nr=%d\n", __func__, llc_sk(sk)->state, vs, nr); } return rc; } int llc_conn_ev_rx_any_frame(struct sock *sk, struct sk_buff *skb) { return 0; } int llc_conn_ev_p_tmr_exp(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type != LLC_CONN_EV_TYPE_P_TMR; } int llc_conn_ev_ack_tmr_exp(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type != LLC_CONN_EV_TYPE_ACK_TMR; } int llc_conn_ev_rej_tmr_exp(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type != LLC_CONN_EV_TYPE_REJ_TMR; } int llc_conn_ev_busy_tmr_exp(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type != LLC_CONN_EV_TYPE_BUSY_TMR; } int llc_conn_ev_init_p_f_cycle(struct sock *sk, struct sk_buff *skb) { return 1; } int llc_conn_ev_tx_buffer_full(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type == LLC_CONN_EV_TYPE_SIMPLE && ev->prim_type == LLC_CONN_EV_TX_BUFF_FULL ? 0 : 1; } /* Event qualifier functions * * these functions simply verify the value of a state flag associated with * the connection and return either a 0 for success or a non-zero value * for not-success; verify the event is the type we expect */ int llc_conn_ev_qlfy_data_flag_eq_1(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->data_flag != 1; } int llc_conn_ev_qlfy_data_flag_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->data_flag; } int llc_conn_ev_qlfy_data_flag_eq_2(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->data_flag != 2; } int llc_conn_ev_qlfy_p_flag_eq_1(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->p_flag != 1; } /** * llc_conn_ev_qlfy_last_frame_eq_1 - checks if frame is last in tx window * @sk: current connection structure. * @skb: current event. * * This function determines when frame which is sent, is last frame of * transmit window, if it is then this function return zero else return * one. This function is used for sending last frame of transmit window * as I-format command with p-bit set to one. Returns 0 if frame is last * frame, 1 otherwise. */ int llc_conn_ev_qlfy_last_frame_eq_1(struct sock *sk, struct sk_buff *skb) { return !(skb_queue_len(&llc_sk(sk)->pdu_unack_q) + 1 == llc_sk(sk)->k); } /** * llc_conn_ev_qlfy_last_frame_eq_0 - checks if frame isn't last in tx window * @sk: current connection structure. * @skb: current event. * * This function determines when frame which is sent, isn't last frame of * transmit window, if it isn't then this function return zero else return * one. Returns 0 if frame isn't last frame, 1 otherwise. */ int llc_conn_ev_qlfy_last_frame_eq_0(struct sock *sk, struct sk_buff *skb) { return skb_queue_len(&llc_sk(sk)->pdu_unack_q) + 1 == llc_sk(sk)->k; } int llc_conn_ev_qlfy_p_flag_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->p_flag; } int llc_conn_ev_qlfy_p_flag_eq_f(struct sock *sk, struct sk_buff *skb) { u8 f_bit; llc_pdu_decode_pf_bit(skb, &f_bit); return llc_sk(sk)->p_flag == f_bit ? 0 : 1; } int llc_conn_ev_qlfy_remote_busy_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->remote_busy_flag; } int llc_conn_ev_qlfy_remote_busy_eq_1(struct sock *sk, struct sk_buff *skb) { return !llc_sk(sk)->remote_busy_flag; } int llc_conn_ev_qlfy_retry_cnt_lt_n2(struct sock *sk, struct sk_buff *skb) { return !(llc_sk(sk)->retry_count < llc_sk(sk)->n2); } int llc_conn_ev_qlfy_retry_cnt_gte_n2(struct sock *sk, struct sk_buff *skb) { return !(llc_sk(sk)->retry_count >= llc_sk(sk)->n2); } int llc_conn_ev_qlfy_s_flag_eq_1(struct sock *sk, struct sk_buff *skb) { return !llc_sk(sk)->s_flag; } int llc_conn_ev_qlfy_s_flag_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->s_flag; } int llc_conn_ev_qlfy_cause_flag_eq_1(struct sock *sk, struct sk_buff *skb) { return !llc_sk(sk)->cause_flag; } int llc_conn_ev_qlfy_cause_flag_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->cause_flag; } int llc_conn_ev_qlfy_set_status_conn(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_CONN; return 0; } int llc_conn_ev_qlfy_set_status_disc(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_DISC; return 0; } int llc_conn_ev_qlfy_set_status_failed(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_FAILED; return 0; } int llc_conn_ev_qlfy_set_status_remote_busy(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_REMOTE_BUSY; return 0; } int llc_conn_ev_qlfy_set_status_refuse(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_REFUSE; return 0; } int llc_conn_ev_qlfy_set_status_conflict(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_CONFLICT; return 0; } int llc_conn_ev_qlfy_set_status_rst_done(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_RESET_DONE; return 0; } |
589 589 588 589 3 1 2 11 3 2 8 2 24 23 21 9 2 1 1 91 88 2 1 54 49 2 1 3 2 2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 | // SPDX-License-Identifier: GPL-2.0-only /* * fs/crypto/hooks.c * * Encryption hooks for higher-level filesystem operations. */ #include "fscrypt_private.h" /** * fscrypt_file_open() - prepare to open a possibly-encrypted regular file * @inode: the inode being opened * @filp: the struct file being set up * * Currently, an encrypted regular file can only be opened if its encryption key * is available; access to the raw encrypted contents is not supported. * Therefore, we first set up the inode's encryption key (if not already done) * and return an error if it's unavailable. * * We also verify that if the parent directory (from the path via which the file * is being opened) is encrypted, then the inode being opened uses the same * encryption policy. This is needed as part of the enforcement that all files * in an encrypted directory tree use the same encryption policy, as a * protection against certain types of offline attacks. Note that this check is * needed even when opening an *unencrypted* file, since it's forbidden to have * an unencrypted file in an encrypted directory. * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code */ int fscrypt_file_open(struct inode *inode, struct file *filp) { int err; struct dentry *dir; err = fscrypt_require_key(inode); if (err) return err; dir = dget_parent(file_dentry(filp)); if (IS_ENCRYPTED(d_inode(dir)) && !fscrypt_has_permitted_context(d_inode(dir), inode)) { fscrypt_warn(inode, "Inconsistent encryption context (parent directory: %lu)", d_inode(dir)->i_ino); err = -EPERM; } dput(dir); return err; } EXPORT_SYMBOL_GPL(fscrypt_file_open); int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry) { if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; /* * We don't need to separately check that the directory inode's key is * available, as it's implied by the dentry not being a no-key name. */ if (!fscrypt_has_permitted_context(dir, inode)) return -EXDEV; return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_link); int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (fscrypt_is_nokey_name(old_dentry) || fscrypt_is_nokey_name(new_dentry)) return -ENOKEY; /* * We don't need to separately check that the directory inodes' keys are * available, as it's implied by the dentries not being no-key names. */ if (old_dir != new_dir) { if (IS_ENCRYPTED(new_dir) && !fscrypt_has_permitted_context(new_dir, d_inode(old_dentry))) return -EXDEV; if ((flags & RENAME_EXCHANGE) && IS_ENCRYPTED(old_dir) && !fscrypt_has_permitted_context(old_dir, d_inode(new_dentry))) return -EXDEV; } return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename); int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { int err = fscrypt_setup_filename(dir, &dentry->d_name, 1, fname); if (err && err != -ENOENT) return err; fscrypt_prepare_dentry(dentry, fname->is_nokey_name); return err; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); /** * fscrypt_prepare_lookup_partial() - prepare lookup without filename setup * @dir: the encrypted directory being searched * @dentry: the dentry being looked up in @dir * * This function should be used by the ->lookup and ->atomic_open methods of * filesystems that handle filename encryption and no-key name encoding * themselves and thus can't use fscrypt_prepare_lookup(). Like * fscrypt_prepare_lookup(), this will try to set up the directory's encryption * key and will set DCACHE_NOKEY_NAME on the dentry if the key is unavailable. * However, this function doesn't set up a struct fscrypt_name for the filename. * * Return: 0 on success; -errno on error. Note that the encryption key being * unavailable is not considered an error. It is also not an error if * the encryption policy is unsupported by this kernel; that is treated * like the key being unavailable, so that files can still be deleted. */ int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry) { int err = fscrypt_get_encryption_info(dir, true); bool is_nokey_name = (!err && !fscrypt_has_encryption_key(dir)); fscrypt_prepare_dentry(dentry, is_nokey_name); return err; } EXPORT_SYMBOL_GPL(fscrypt_prepare_lookup_partial); int __fscrypt_prepare_readdir(struct inode *dir) { return fscrypt_get_encryption_info(dir, true); } EXPORT_SYMBOL_GPL(__fscrypt_prepare_readdir); int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (attr->ia_valid & ATTR_SIZE) return fscrypt_require_key(d_inode(dentry)); return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_setattr); /** * fscrypt_prepare_setflags() - prepare to change flags with FS_IOC_SETFLAGS * @inode: the inode on which flags are being changed * @oldflags: the old flags * @flags: the new flags * * The caller should be holding i_rwsem for write. * * Return: 0 on success; -errno if the flags change isn't allowed or if * another error occurs. */ int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { struct fscrypt_inode_info *ci; struct fscrypt_master_key *mk; int err; /* * When the CASEFOLD flag is set on an encrypted directory, we must * derive the secret key needed for the dirhash. This is only possible * if the directory uses a v2 encryption policy. */ if (IS_ENCRYPTED(inode) && (flags & ~oldflags & FS_CASEFOLD_FL)) { err = fscrypt_require_key(inode); if (err) return err; ci = inode->i_crypt_info; if (ci->ci_policy.version != FSCRYPT_POLICY_V2) return -EINVAL; mk = ci->ci_master_key; down_read(&mk->mk_sem); if (mk->mk_present) err = fscrypt_derive_dirhash_key(ci, mk); else err = -ENOKEY; up_read(&mk->mk_sem); return err; } return 0; } /** * fscrypt_prepare_symlink() - prepare to create a possibly-encrypted symlink * @dir: directory in which the symlink is being created * @target: plaintext symlink target * @len: length of @target excluding null terminator * @max_len: space the filesystem has available to store the symlink target * @disk_link: (out) the on-disk symlink target being prepared * * This function computes the size the symlink target will require on-disk, * stores it in @disk_link->len, and validates it against @max_len. An * encrypted symlink may be longer than the original. * * Additionally, @disk_link->name is set to @target if the symlink will be * unencrypted, but left NULL if the symlink will be encrypted. For encrypted * symlinks, the filesystem must call fscrypt_encrypt_symlink() to create the * on-disk target later. (The reason for the two-step process is that some * filesystems need to know the size of the symlink target before creating the * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.) * * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long, * -ENOKEY if the encryption key is missing, or another -errno code if a problem * occurred while setting up the encryption key. */ int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link) { const union fscrypt_policy *policy; /* * To calculate the size of the encrypted symlink target we need to know * the amount of NUL padding, which is determined by the flags set in * the encryption policy which will be inherited from the directory. */ policy = fscrypt_policy_to_inherit(dir); if (policy == NULL) { /* Not encrypted */ disk_link->name = (unsigned char *)target; disk_link->len = len + 1; if (disk_link->len > max_len) return -ENAMETOOLONG; return 0; } if (IS_ERR(policy)) return PTR_ERR(policy); /* * Calculate the size of the encrypted symlink and verify it won't * exceed max_len. Note that for historical reasons, encrypted symlink * targets are prefixed with the ciphertext length, despite this * actually being redundant with i_size. This decreases by 2 bytes the * longest symlink target we can accept. * * We could recover 1 byte by not counting a null terminator, but * counting it (even though it is meaningless for ciphertext) is simpler * for now since filesystems will assume it is there and subtract it. */ if (!__fscrypt_fname_encrypted_size(policy, len, max_len - sizeof(struct fscrypt_symlink_data) - 1, &disk_link->len)) return -ENAMETOOLONG; disk_link->len += sizeof(struct fscrypt_symlink_data) + 1; disk_link->name = NULL; return 0; } EXPORT_SYMBOL_GPL(fscrypt_prepare_symlink); int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { int err; struct qstr iname = QSTR_INIT(target, len); struct fscrypt_symlink_data *sd; unsigned int ciphertext_len; /* * fscrypt_prepare_new_inode() should have already set up the new * symlink inode's encryption key. We don't wait until now to do it, * since we may be in a filesystem transaction now. */ if (WARN_ON_ONCE(!fscrypt_has_encryption_key(inode))) return -ENOKEY; if (disk_link->name) { /* filesystem-provided buffer */ sd = (struct fscrypt_symlink_data *)disk_link->name; } else { sd = kmalloc(disk_link->len, GFP_NOFS); if (!sd) return -ENOMEM; } ciphertext_len = disk_link->len - sizeof(*sd) - 1; sd->len = cpu_to_le16(ciphertext_len); err = fscrypt_fname_encrypt(inode, &iname, sd->encrypted_path, ciphertext_len); if (err) goto err_free_sd; /* * Null-terminating the ciphertext doesn't make sense, but we still * count the null terminator in the length, so we might as well * initialize it just in case the filesystem writes it out. */ sd->encrypted_path[ciphertext_len] = '\0'; /* Cache the plaintext symlink target for later use by get_link() */ err = -ENOMEM; inode->i_link = kmemdup(target, len + 1, GFP_NOFS); if (!inode->i_link) goto err_free_sd; if (!disk_link->name) disk_link->name = (unsigned char *)sd; return 0; err_free_sd: if (!disk_link->name) kfree(sd); return err; } EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink); /** * fscrypt_get_symlink() - get the target of an encrypted symlink * @inode: the symlink inode * @caddr: the on-disk contents of the symlink * @max_size: size of @caddr buffer * @done: if successful, will be set up to free the returned target if needed * * If the symlink's encryption key is available, we decrypt its target. * Otherwise, we encode its target for presentation. * * This may sleep, so the filesystem must have dropped out of RCU mode already. * * Return: the presentable symlink target or an ERR_PTR() */ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done) { const struct fscrypt_symlink_data *sd; struct fscrypt_str cstr, pstr; bool has_key; int err; /* This is for encrypted symlinks only */ if (WARN_ON_ONCE(!IS_ENCRYPTED(inode))) return ERR_PTR(-EINVAL); /* If the decrypted target is already cached, just return it. */ pstr.name = READ_ONCE(inode->i_link); if (pstr.name) return pstr.name; /* * Try to set up the symlink's encryption key, but we can continue * regardless of whether the key is available or not. */ err = fscrypt_get_encryption_info(inode, false); if (err) return ERR_PTR(err); has_key = fscrypt_has_encryption_key(inode); /* * For historical reasons, encrypted symlink targets are prefixed with * the ciphertext length, even though this is redundant with i_size. */ if (max_size < sizeof(*sd) + 1) return ERR_PTR(-EUCLEAN); sd = caddr; cstr.name = (unsigned char *)sd->encrypted_path; cstr.len = le16_to_cpu(sd->len); if (cstr.len == 0) return ERR_PTR(-EUCLEAN); if (cstr.len + sizeof(*sd) > max_size) return ERR_PTR(-EUCLEAN); err = fscrypt_fname_alloc_buffer(cstr.len, &pstr); if (err) return ERR_PTR(err); err = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); if (err) goto err_kfree; err = -EUCLEAN; if (pstr.name[0] == '\0') goto err_kfree; pstr.name[pstr.len] = '\0'; /* * Cache decrypted symlink targets in i_link for later use. Don't cache * symlink targets encoded without the key, since those become outdated * once the key is added. This pairs with the READ_ONCE() above and in * the VFS path lookup code. */ if (!has_key || cmpxchg_release(&inode->i_link, NULL, pstr.name) != NULL) set_delayed_call(done, kfree_link, pstr.name); return pstr.name; err_kfree: kfree(pstr.name); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(fscrypt_get_symlink); /** * fscrypt_symlink_getattr() - set the correct st_size for encrypted symlinks * @path: the path for the encrypted symlink being queried * @stat: the struct being filled with the symlink's attributes * * Override st_size of encrypted symlinks to be the length of the decrypted * symlink target (or the no-key encoded symlink target, if the key is * unavailable) rather than the length of the encrypted symlink target. This is * necessary for st_size to match the symlink target that userspace actually * sees. POSIX requires this, and some userspace programs depend on it. * * This requires reading the symlink target from disk if needed, setting up the * inode's encryption key if possible, and then decrypting or encoding the * symlink target. This makes lstat() more heavyweight than is normally the * case. However, decrypted symlink targets will be cached in ->i_link, so * usually the symlink won't have to be read and decrypted again later if/when * it is actually followed, readlink() is called, or lstat() is called again. * * Return: 0 on success, -errno on failure */ int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat) { struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); const char *link; DEFINE_DELAYED_CALL(done); /* * To get the symlink target that userspace will see (whether it's the * decrypted target or the no-key encoded target), we can just get it in * the same way the VFS does during path resolution and readlink(). */ link = READ_ONCE(inode->i_link); if (!link) { link = inode->i_op->get_link(dentry, inode, &done); if (IS_ERR(link)) return PTR_ERR(link); } stat->size = strlen(link); do_delayed_call(&done); return 0; } EXPORT_SYMBOL_GPL(fscrypt_symlink_getattr); |
22 6 18 29 29 32 32 2 2 22 1 1 2 7 12 3 11 14 12 2 9 16 16 16 16 6 7 6 26 26 24 2 19 4 22 3 3 2 18 5 21 3 11 4 3 16 5 5 4 24 4 18 8 6 23 23 26 1 25 24 24 9 9 9 16 6 9 5 11 2 11 10 10 10 11 16 13 2 16 12 2 10 3 1 1 1 2 7 2 1 4 4 21 21 21 2 19 21 21 11 11 20 20 2 21 8 33 1 3 5 2 22 6 16 11 1 1 3 3 5 3 3 9 1 9 9 3 1 2 1 1 2 2 3 4 1 3 8 3 38 38 5 4 4 2 5 9 3 2 3 2 3 2 4 2 2 13 1 2 4 2 2 2 5 2 2 20 1 2 1 1 13 15 2 12 2 3 1 3 3 8 3 5 9 7 12 29 29 22 3 1 2 11 7 1 17 14 14 1 13 4 5 8 2 11 16 16 3 6 3 4 4 3 34 1 2 1 4 14 16 4 1 1 12 12 4 5 2 7 2 7 7 3 1 4 4 48 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 | // SPDX-License-Identifier: GPL-2.0-only /* * VMware vSockets Driver * * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. */ /* Implementation notes: * * - There are two kinds of sockets: those created by user action (such as * calling socket(2)) and those created by incoming connection request packets. * * - There are two "global" tables, one for bound sockets (sockets that have * specified an address that they are responsible for) and one for connected * sockets (sockets that have established a connection with another socket). * These tables are "global" in that all sockets on the system are placed * within them. - Note, though, that the bound table contains an extra entry * for a list of unbound sockets and SOCK_DGRAM sockets will always remain in * that list. The bound table is used solely for lookup of sockets when packets * are received and that's not necessary for SOCK_DGRAM sockets since we create * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM * sockets out of the bound hash buckets will reduce the chance of collisions * when looking for SOCK_STREAM sockets and prevents us from having to check the * socket type in the hash table lookups. * * - Sockets created by user action will either be "client" sockets that * initiate a connection or "server" sockets that listen for connections; we do * not support simultaneous connects (two "client" sockets connecting). * * - "Server" sockets are referred to as listener sockets throughout this * implementation because they are in the TCP_LISTEN state. When a * connection request is received (the second kind of socket mentioned above), * we create a new socket and refer to it as a pending socket. These pending * sockets are placed on the pending connection list of the listener socket. * When future packets are received for the address the listener socket is * bound to, we check if the source of the packet is from one that has an * existing pending connection. If it does, we process the packet for the * pending socket. When that socket reaches the connected state, it is removed * from the listener socket's pending list and enqueued in the listener * socket's accept queue. Callers of accept(2) will accept connected sockets * from the listener socket's accept queue. If the socket cannot be accepted * for some reason then it is marked rejected. Once the connection is * accepted, it is owned by the user process and the responsibility for cleanup * falls with that user process. * * - It is possible that these pending sockets will never reach the connected * state; in fact, we may never receive another packet after the connection * request. Because of this, we must schedule a cleanup function to run in the * future, after some amount of time passes where a connection should have been * established. This function ensures that the socket is off all lists so it * cannot be retrieved, then drops all references to the socket so it is cleaned * up (sock_put() -> sk_free() -> our sk_destruct implementation). Note this * function will also cleanup rejected sockets, those that reach the connected * state but leave it before they have been accepted. * * - Lock ordering for pending or accept queue sockets is: * * lock_sock(listener); * lock_sock_nested(pending, SINGLE_DEPTH_NESTING); * * Using explicit nested locking keeps lockdep happy since normally only one * lock of a given class may be taken at a time. * * - Sockets created by user action will be cleaned up when the user process * calls close(2), causing our release implementation to be called. Our release * implementation will perform some cleanup then drop the last reference so our * sk_destruct implementation is invoked. Our sk_destruct implementation will * perform additional cleanup that's common for both types of sockets. * * - A socket's reference count is what ensures that the structure won't be * freed. Each entry in a list (such as the "global" bound and connected tables * and the listener socket's pending list and connected queue) ensures a * reference. When we defer work until process context and pass a socket as our * argument, we must ensure the reference count is increased to ensure the * socket isn't freed before the function is run; the deferred function will * then drop the reference. * * - sk->sk_state uses the TCP state constants because they are widely used by * other address families and exposed to userspace tools like ss(8): * * TCP_CLOSE - unconnected * TCP_SYN_SENT - connecting * TCP_ESTABLISHED - connected * TCP_CLOSING - disconnecting * TCP_LISTEN - listening */ #include <linux/compat.h> #include <linux/types.h> #include <linux/bitops.h> #include <linux/cred.h> #include <linux/errqueue.h> #include <linux/init.h> #include <linux/io.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/kmod.h> #include <linux/list.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/net.h> #include <linux/poll.h> #include <linux/random.h> #include <linux/skbuff.h> #include <linux/smp.h> #include <linux/socket.h> #include <linux/stddef.h> #include <linux/unistd.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <net/sock.h> #include <net/af_vsock.h> #include <uapi/linux/vm_sockets.h> static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr); static void vsock_sk_destruct(struct sock *sk); static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); /* Protocol family. */ struct proto vsock_proto = { .name = "AF_VSOCK", .owner = THIS_MODULE, .obj_size = sizeof(struct vsock_sock), #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = vsock_bpf_update_proto, #endif }; /* The default peer timeout indicates how long we will wait for a peer response * to a control message. */ #define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) #define VSOCK_DEFAULT_BUFFER_SIZE (1024 * 256) #define VSOCK_DEFAULT_BUFFER_MAX_SIZE (1024 * 256) #define VSOCK_DEFAULT_BUFFER_MIN_SIZE 128 /* Transport used for host->guest communication */ static const struct vsock_transport *transport_h2g; /* Transport used for guest->host communication */ static const struct vsock_transport *transport_g2h; /* Transport used for DGRAM communication */ static const struct vsock_transport *transport_dgram; /* Transport used for local communication */ static const struct vsock_transport *transport_local; static DEFINE_MUTEX(vsock_register_mutex); /**** UTILS ****/ /* Each bound VSocket is stored in the bind hash table and each connected * VSocket is stored in the connected hash table. * * Unbound sockets are all put on the same list attached to the end of the hash * table (vsock_unbound_sockets). Bound sockets are added to the hash table in * the bucket that their local address hashes to (vsock_bound_sockets(addr) * represents the list that addr hashes to). * * Specifically, we initialize the vsock_bind_table array to a size of * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function * mods with VSOCK_HASH_SIZE to ensure this. */ #define MAX_PORT_RETRIES 24 #define VSOCK_HASH(addr) ((addr)->svm_port % VSOCK_HASH_SIZE) #define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)]) #define vsock_unbound_sockets (&vsock_bind_table[VSOCK_HASH_SIZE]) /* XXX This can probably be implemented in a better way. */ #define VSOCK_CONN_HASH(src, dst) \ (((src)->svm_cid ^ (dst)->svm_port) % VSOCK_HASH_SIZE) #define vsock_connected_sockets(src, dst) \ (&vsock_connected_table[VSOCK_CONN_HASH(src, dst)]) #define vsock_connected_sockets_vsk(vsk) \ vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr) struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; EXPORT_SYMBOL_GPL(vsock_bind_table); struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; EXPORT_SYMBOL_GPL(vsock_connected_table); DEFINE_SPINLOCK(vsock_table_lock); EXPORT_SYMBOL_GPL(vsock_table_lock); /* Autobind this socket to the local address if necessary. */ static int vsock_auto_bind(struct vsock_sock *vsk) { struct sock *sk = sk_vsock(vsk); struct sockaddr_vm local_addr; if (vsock_addr_bound(&vsk->local_addr)) return 0; vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); return __vsock_bind(sk, &local_addr); } static void vsock_init_tables(void) { int i; for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++) INIT_LIST_HEAD(&vsock_bind_table[i]); for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) INIT_LIST_HEAD(&vsock_connected_table[i]); } static void __vsock_insert_bound(struct list_head *list, struct vsock_sock *vsk) { sock_hold(&vsk->sk); list_add(&vsk->bound_table, list); } static void __vsock_insert_connected(struct list_head *list, struct vsock_sock *vsk) { sock_hold(&vsk->sk); list_add(&vsk->connected_table, list); } static void __vsock_remove_bound(struct vsock_sock *vsk) { list_del_init(&vsk->bound_table); sock_put(&vsk->sk); } static void __vsock_remove_connected(struct vsock_sock *vsk) { list_del_init(&vsk->connected_table); sock_put(&vsk->sk); } static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr) { struct vsock_sock *vsk; list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) { if (vsock_addr_equals_addr(addr, &vsk->local_addr)) return sk_vsock(vsk); if (addr->svm_port == vsk->local_addr.svm_port && (vsk->local_addr.svm_cid == VMADDR_CID_ANY || addr->svm_cid == VMADDR_CID_ANY)) return sk_vsock(vsk); } return NULL; } static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst) { struct vsock_sock *vsk; list_for_each_entry(vsk, vsock_connected_sockets(src, dst), connected_table) { if (vsock_addr_equals_addr(src, &vsk->remote_addr) && dst->svm_port == vsk->local_addr.svm_port) { return sk_vsock(vsk); } } return NULL; } static void vsock_insert_unbound(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); __vsock_insert_bound(vsock_unbound_sockets, vsk); spin_unlock_bh(&vsock_table_lock); } void vsock_insert_connected(struct vsock_sock *vsk) { struct list_head *list = vsock_connected_sockets( &vsk->remote_addr, &vsk->local_addr); spin_lock_bh(&vsock_table_lock); __vsock_insert_connected(list, vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_insert_connected); void vsock_remove_bound(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); if (__vsock_in_bound_table(vsk)) __vsock_remove_bound(vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_remove_bound); void vsock_remove_connected(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); if (__vsock_in_connected_table(vsk)) __vsock_remove_connected(vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_remove_connected); struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr) { struct sock *sk; spin_lock_bh(&vsock_table_lock); sk = __vsock_find_bound_socket(addr); if (sk) sock_hold(sk); spin_unlock_bh(&vsock_table_lock); return sk; } EXPORT_SYMBOL_GPL(vsock_find_bound_socket); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst) { struct sock *sk; spin_lock_bh(&vsock_table_lock); sk = __vsock_find_connected_socket(src, dst); if (sk) sock_hold(sk); spin_unlock_bh(&vsock_table_lock); return sk; } EXPORT_SYMBOL_GPL(vsock_find_connected_socket); void vsock_remove_sock(struct vsock_sock *vsk) { vsock_remove_bound(vsk); vsock_remove_connected(vsk); } EXPORT_SYMBOL_GPL(vsock_remove_sock); void vsock_for_each_connected_socket(struct vsock_transport *transport, void (*fn)(struct sock *sk)) { int i; spin_lock_bh(&vsock_table_lock); for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) { struct vsock_sock *vsk; list_for_each_entry(vsk, &vsock_connected_table[i], connected_table) { if (vsk->transport != transport) continue; fn(sk_vsock(vsk)); } } spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket); void vsock_add_pending(struct sock *listener, struct sock *pending) { struct vsock_sock *vlistener; struct vsock_sock *vpending; vlistener = vsock_sk(listener); vpending = vsock_sk(pending); sock_hold(pending); sock_hold(listener); list_add_tail(&vpending->pending_links, &vlistener->pending_links); } EXPORT_SYMBOL_GPL(vsock_add_pending); void vsock_remove_pending(struct sock *listener, struct sock *pending) { struct vsock_sock *vpending = vsock_sk(pending); list_del_init(&vpending->pending_links); sock_put(listener); sock_put(pending); } EXPORT_SYMBOL_GPL(vsock_remove_pending); void vsock_enqueue_accept(struct sock *listener, struct sock *connected) { struct vsock_sock *vlistener; struct vsock_sock *vconnected; vlistener = vsock_sk(listener); vconnected = vsock_sk(connected); sock_hold(connected); sock_hold(listener); list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue); } EXPORT_SYMBOL_GPL(vsock_enqueue_accept); static bool vsock_use_local_transport(unsigned int remote_cid) { if (!transport_local) return false; if (remote_cid == VMADDR_CID_LOCAL) return true; if (transport_g2h) { return remote_cid == transport_g2h->get_local_cid(); } else { return remote_cid == VMADDR_CID_HOST; } } static void vsock_deassign_transport(struct vsock_sock *vsk) { if (!vsk->transport) return; vsk->transport->destruct(vsk); module_put(vsk->transport->module); vsk->transport = NULL; } /* Assign a transport to a socket and call the .init transport callback. * * Note: for connection oriented socket this must be called when vsk->remote_addr * is set (e.g. during the connect() or when a connection request on a listener * socket is received). * The vsk->remote_addr is used to decide which transport to use: * - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if * g2h is not loaded, will use local transport; * - remote CID <= VMADDR_CID_HOST or h2g is not loaded or remote flags field * includes VMADDR_FLAG_TO_HOST flag value, will use guest->host transport; * - remote CID > VMADDR_CID_HOST will use host->guest transport; */ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) { const struct vsock_transport *new_transport; struct sock *sk = sk_vsock(vsk); unsigned int remote_cid = vsk->remote_addr.svm_cid; __u8 remote_flags; int ret; /* If the packet is coming with the source and destination CIDs higher * than VMADDR_CID_HOST, then a vsock channel where all the packets are * forwarded to the host should be established. Then the host will * need to forward the packets to the guest. * * The flag is set on the (listen) receive path (psk is not NULL). On * the connect path the flag can be set by the user space application. */ if (psk && vsk->local_addr.svm_cid > VMADDR_CID_HOST && vsk->remote_addr.svm_cid > VMADDR_CID_HOST) vsk->remote_addr.svm_flags |= VMADDR_FLAG_TO_HOST; remote_flags = vsk->remote_addr.svm_flags; switch (sk->sk_type) { case SOCK_DGRAM: new_transport = transport_dgram; break; case SOCK_STREAM: case SOCK_SEQPACKET: if (vsock_use_local_transport(remote_cid)) new_transport = transport_local; else if (remote_cid <= VMADDR_CID_HOST || !transport_h2g || (remote_flags & VMADDR_FLAG_TO_HOST)) new_transport = transport_g2h; else new_transport = transport_h2g; break; default: return -ESOCKTNOSUPPORT; } if (vsk->transport) { if (vsk->transport == new_transport) return 0; /* transport->release() must be called with sock lock acquired. * This path can only be taken during vsock_connect(), where we * have already held the sock lock. In the other cases, this * function is called on a new socket which is not assigned to * any transport. */ vsk->transport->release(vsk); vsock_deassign_transport(vsk); } /* We increase the module refcnt to prevent the transport unloading * while there are open sockets assigned to it. */ if (!new_transport || !try_module_get(new_transport->module)) return -ENODEV; if (sk->sk_type == SOCK_SEQPACKET) { if (!new_transport->seqpacket_allow || !new_transport->seqpacket_allow(remote_cid)) { module_put(new_transport->module); return -ESOCKTNOSUPPORT; } } ret = new_transport->init(vsk, psk); if (ret) { module_put(new_transport->module); return ret; } vsk->transport = new_transport; return 0; } EXPORT_SYMBOL_GPL(vsock_assign_transport); bool vsock_find_cid(unsigned int cid) { if (transport_g2h && cid == transport_g2h->get_local_cid()) return true; if (transport_h2g && cid == VMADDR_CID_HOST) return true; if (transport_local && cid == VMADDR_CID_LOCAL) return true; return false; } EXPORT_SYMBOL_GPL(vsock_find_cid); static struct sock *vsock_dequeue_accept(struct sock *listener) { struct vsock_sock *vlistener; struct vsock_sock *vconnected; vlistener = vsock_sk(listener); if (list_empty(&vlistener->accept_queue)) return NULL; vconnected = list_entry(vlistener->accept_queue.next, struct vsock_sock, accept_queue); list_del_init(&vconnected->accept_queue); sock_put(listener); /* The caller will need a reference on the connected socket so we let * it call sock_put(). */ return sk_vsock(vconnected); } static bool vsock_is_accept_queue_empty(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); return list_empty(&vsk->accept_queue); } static bool vsock_is_pending(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); return !list_empty(&vsk->pending_links); } static int vsock_send_shutdown(struct sock *sk, int mode) { struct vsock_sock *vsk = vsock_sk(sk); if (!vsk->transport) return -ENODEV; return vsk->transport->shutdown(vsk, mode); } static void vsock_pending_work(struct work_struct *work) { struct sock *sk; struct sock *listener; struct vsock_sock *vsk; bool cleanup; vsk = container_of(work, struct vsock_sock, pending_work.work); sk = sk_vsock(vsk); listener = vsk->listener; cleanup = true; lock_sock(listener); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (vsock_is_pending(sk)) { vsock_remove_pending(listener, sk); sk_acceptq_removed(listener); } else if (!vsk->rejected) { /* We are not on the pending list and accept() did not reject * us, so we must have been accepted by our user process. We * just need to drop our references to the sockets and be on * our way. */ cleanup = false; goto out; } /* We need to remove ourself from the global connected sockets list so * incoming packets can't find this socket, and to reduce the reference * count. */ vsock_remove_connected(vsk); sk->sk_state = TCP_CLOSE; out: release_sock(sk); release_sock(listener); if (cleanup) sock_put(sk); sock_put(sk); sock_put(listener); } /**** SOCKET OPERATIONS ****/ static int __vsock_bind_connectible(struct vsock_sock *vsk, struct sockaddr_vm *addr) { static u32 port; struct sockaddr_vm new_addr; if (!port) port = get_random_u32_above(LAST_RESERVED_PORT); vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port); if (addr->svm_port == VMADDR_PORT_ANY) { bool found = false; unsigned int i; for (i = 0; i < MAX_PORT_RETRIES; i++) { if (port <= LAST_RESERVED_PORT) port = LAST_RESERVED_PORT + 1; new_addr.svm_port = port++; if (!__vsock_find_bound_socket(&new_addr)) { found = true; break; } } if (!found) return -EADDRNOTAVAIL; } else { /* If port is in reserved range, ensure caller * has necessary privileges. */ if (addr->svm_port <= LAST_RESERVED_PORT && !capable(CAP_NET_BIND_SERVICE)) { return -EACCES; } if (__vsock_find_bound_socket(&new_addr)) return -EADDRINUSE; } vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port); /* Remove connection oriented sockets from the unbound list and add them * to the hash table for easy lookup by its address. The unbound list * is simply an extra entry at the end of the hash table, a trick used * by AF_UNIX. */ __vsock_remove_bound(vsk); __vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk); return 0; } static int __vsock_bind_dgram(struct vsock_sock *vsk, struct sockaddr_vm *addr) { return vsk->transport->dgram_bind(vsk, addr); } static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) { struct vsock_sock *vsk = vsock_sk(sk); int retval; /* First ensure this socket isn't already bound. */ if (vsock_addr_bound(&vsk->local_addr)) return -EINVAL; /* Now bind to the provided address or select appropriate values if * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that * like AF_INET prevents binding to a non-local IP address (in most * cases), we only allow binding to a local CID. */ if (addr->svm_cid != VMADDR_CID_ANY && !vsock_find_cid(addr->svm_cid)) return -EADDRNOTAVAIL; switch (sk->sk_socket->type) { case SOCK_STREAM: case SOCK_SEQPACKET: spin_lock_bh(&vsock_table_lock); retval = __vsock_bind_connectible(vsk, addr); spin_unlock_bh(&vsock_table_lock); break; case SOCK_DGRAM: retval = __vsock_bind_dgram(vsk, addr); break; default: retval = -EINVAL; break; } return retval; } static void vsock_connect_timeout(struct work_struct *work); static struct sock *__vsock_create(struct net *net, struct socket *sock, struct sock *parent, gfp_t priority, unsigned short type, int kern) { struct sock *sk; struct vsock_sock *psk; struct vsock_sock *vsk; sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern); if (!sk) return NULL; sock_init_data(sock, sk); /* sk->sk_type is normally set in sock_init_data, but only if sock is * non-NULL. We make sure that our sockets always have a type by * setting it here if needed. */ if (!sock) sk->sk_type = type; vsk = vsock_sk(sk); vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); sk->sk_destruct = vsock_sk_destruct; sk->sk_backlog_rcv = vsock_queue_rcv_skb; sock_reset_flag(sk, SOCK_DONE); INIT_LIST_HEAD(&vsk->bound_table); INIT_LIST_HEAD(&vsk->connected_table); vsk->listener = NULL; INIT_LIST_HEAD(&vsk->pending_links); INIT_LIST_HEAD(&vsk->accept_queue); vsk->rejected = false; vsk->sent_request = false; vsk->ignore_connecting_rst = false; vsk->peer_shutdown = 0; INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout); INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work); psk = parent ? vsock_sk(parent) : NULL; if (parent) { vsk->trusted = psk->trusted; vsk->owner = get_cred(psk->owner); vsk->connect_timeout = psk->connect_timeout; vsk->buffer_size = psk->buffer_size; vsk->buffer_min_size = psk->buffer_min_size; vsk->buffer_max_size = psk->buffer_max_size; security_sk_clone(parent, sk); } else { vsk->trusted = ns_capable_noaudit(&init_user_ns, CAP_NET_ADMIN); vsk->owner = get_current_cred(); vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE; vsk->buffer_min_size = VSOCK_DEFAULT_BUFFER_MIN_SIZE; vsk->buffer_max_size = VSOCK_DEFAULT_BUFFER_MAX_SIZE; } return sk; } static bool sock_type_connectible(u16 type) { return (type == SOCK_STREAM) || (type == SOCK_SEQPACKET); } static void __vsock_release(struct sock *sk, int level) { if (sk) { struct sock *pending; struct vsock_sock *vsk; vsk = vsock_sk(sk); pending = NULL; /* Compiler warning. */ /* When "level" is SINGLE_DEPTH_NESTING, use the nested * version to avoid the warning "possible recursive locking * detected". When "level" is 0, lock_sock_nested(sk, level) * is the same as lock_sock(sk). */ lock_sock_nested(sk, level); if (vsk->transport) vsk->transport->release(vsk); else if (sock_type_connectible(sk->sk_type)) vsock_remove_sock(vsk); sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; skb_queue_purge(&sk->sk_receive_queue); /* Clean up any sockets that never were accepted. */ while ((pending = vsock_dequeue_accept(sk)) != NULL) { __vsock_release(pending, SINGLE_DEPTH_NESTING); sock_put(pending); } release_sock(sk); sock_put(sk); } } static void vsock_sk_destruct(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); vsock_deassign_transport(vsk); /* When clearing these addresses, there's no need to set the family and * possibly register the address family with the kernel. */ vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); put_cred(vsk->owner); } static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; err = sock_queue_rcv_skb(sk, skb); if (err) kfree_skb(skb); return err; } struct sock *vsock_create_connected(struct sock *parent) { return __vsock_create(sock_net(parent), NULL, parent, GFP_KERNEL, parent->sk_type, 0); } EXPORT_SYMBOL_GPL(vsock_create_connected); s64 vsock_stream_has_data(struct vsock_sock *vsk) { return vsk->transport->stream_has_data(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_data); s64 vsock_connectible_has_data(struct vsock_sock *vsk) { struct sock *sk = sk_vsock(vsk); if (sk->sk_type == SOCK_SEQPACKET) return vsk->transport->seqpacket_has_data(vsk); else return vsock_stream_has_data(vsk); } EXPORT_SYMBOL_GPL(vsock_connectible_has_data); s64 vsock_stream_has_space(struct vsock_sock *vsk) { return vsk->transport->stream_has_space(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_space); void vsock_data_ready(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); if (vsock_stream_has_data(vsk) >= sk->sk_rcvlowat || sock_flag(sk, SOCK_DONE)) sk->sk_data_ready(sk); } EXPORT_SYMBOL_GPL(vsock_data_ready); static int vsock_release(struct socket *sock) { __vsock_release(sock->sk, 0); sock->sk = NULL; sock->state = SS_FREE; return 0; } static int vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { int err; struct sock *sk; struct sockaddr_vm *vm_addr; sk = sock->sk; if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0) return -EINVAL; lock_sock(sk); err = __vsock_bind(sk, vm_addr); release_sock(sk); return err; } static int vsock_getname(struct socket *sock, struct sockaddr *addr, int peer) { int err; struct sock *sk; struct vsock_sock *vsk; struct sockaddr_vm *vm_addr; sk = sock->sk; vsk = vsock_sk(sk); err = 0; lock_sock(sk); if (peer) { if (sock->state != SS_CONNECTED) { err = -ENOTCONN; goto out; } vm_addr = &vsk->remote_addr; } else { vm_addr = &vsk->local_addr; } if (!vm_addr) { err = -EINVAL; goto out; } /* sys_getsockname() and sys_getpeername() pass us a * MAX_SOCK_ADDR-sized buffer and don't set addr_len. Unfortunately * that macro is defined in socket.c instead of .h, so we hardcode its * value here. */ BUILD_BUG_ON(sizeof(*vm_addr) > 128); memcpy(addr, vm_addr, sizeof(*vm_addr)); err = sizeof(*vm_addr); out: release_sock(sk); return err; } static int vsock_shutdown(struct socket *sock, int mode) { int err; struct sock *sk; /* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode * here like the other address families do. Note also that the * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3), * which is what we want. */ mode++; if ((mode & ~SHUTDOWN_MASK) || !mode) return -EINVAL; /* If this is a connection oriented socket and it is not connected then * bail out immediately. If it is a DGRAM socket then we must first * kick the socket so that it wakes up from any sleeping calls, for * example recv(), and then afterwards return the error. */ sk = sock->sk; lock_sock(sk); if (sock->state == SS_UNCONNECTED) { err = -ENOTCONN; if (sock_type_connectible(sk->sk_type)) goto out; } else { sock->state = SS_DISCONNECTING; err = 0; } /* Receive and send shutdowns are treated alike. */ mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN); if (mode) { sk->sk_shutdown |= mode; sk->sk_state_change(sk); if (sock_type_connectible(sk->sk_type)) { sock_reset_flag(sk, SOCK_DONE); vsock_send_shutdown(sk, mode); } } out: release_sock(sk); return err; } static __poll_t vsock_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk; __poll_t mask; struct vsock_sock *vsk; sk = sock->sk; vsk = vsock_sk(sk); poll_wait(file, sk_sleep(sk), wait); mask = 0; if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) /* Signify that there has been an error on this socket. */ mask |= EPOLLERR; /* INET sockets treat local write shutdown and peer write shutdown as a * case of EPOLLHUP set. */ if ((sk->sk_shutdown == SHUTDOWN_MASK) || ((sk->sk_shutdown & SEND_SHUTDOWN) && (vsk->peer_shutdown & SEND_SHUTDOWN))) { mask |= EPOLLHUP; } if (sk->sk_shutdown & RCV_SHUTDOWN || vsk->peer_shutdown & SEND_SHUTDOWN) { mask |= EPOLLRDHUP; } if (sock->type == SOCK_DGRAM) { /* For datagram sockets we can read if there is something in * the queue and write as long as the socket isn't shutdown for * sending. */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) { mask |= EPOLLIN | EPOLLRDNORM; } if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; } else if (sock_type_connectible(sk->sk_type)) { const struct vsock_transport *transport; lock_sock(sk); transport = vsk->transport; /* Listening sockets that have connections in their accept * queue can be read. */ if (sk->sk_state == TCP_LISTEN && !vsock_is_accept_queue_empty(sk)) mask |= EPOLLIN | EPOLLRDNORM; /* If there is something in the queue then we can read. */ if (transport && transport->stream_is_active(vsk) && !(sk->sk_shutdown & RCV_SHUTDOWN)) { bool data_ready_now = false; int target = sock_rcvlowat(sk, 0, INT_MAX); int ret = transport->notify_poll_in( vsk, target, &data_ready_now); if (ret < 0) { mask |= EPOLLERR; } else { if (data_ready_now) mask |= EPOLLIN | EPOLLRDNORM; } } /* Sockets whose connections have been closed, reset, or * terminated should also be considered read, and we check the * shutdown flag for that. */ if (sk->sk_shutdown & RCV_SHUTDOWN || vsk->peer_shutdown & SEND_SHUTDOWN) { mask |= EPOLLIN | EPOLLRDNORM; } /* Connected sockets that can produce data can be written. */ if (transport && sk->sk_state == TCP_ESTABLISHED) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { bool space_avail_now = false; int ret = transport->notify_poll_out( vsk, 1, &space_avail_now); if (ret < 0) { mask |= EPOLLERR; } else { if (space_avail_now) /* Remove EPOLLWRBAND since INET * sockets are not setting it. */ mask |= EPOLLOUT | EPOLLWRNORM; } } } /* Simulate INET socket poll behaviors, which sets * EPOLLOUT|EPOLLWRNORM when peer is closed and nothing to read, * but local send is not shutdown. */ if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= EPOLLOUT | EPOLLWRNORM; } release_sock(sk); } return mask; } static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor) { struct vsock_sock *vsk = vsock_sk(sk); return vsk->transport->read_skb(vsk, read_actor); } static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { int err; struct sock *sk; struct vsock_sock *vsk; struct sockaddr_vm *remote_addr; const struct vsock_transport *transport; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; /* For now, MSG_DONTWAIT is always assumed... */ err = 0; sk = sock->sk; vsk = vsock_sk(sk); lock_sock(sk); transport = vsk->transport; err = vsock_auto_bind(vsk); if (err) goto out; /* If the provided message contains an address, use that. Otherwise * fall back on the socket's remote handle (if it has been connected). */ if (msg->msg_name && vsock_addr_cast(msg->msg_name, msg->msg_namelen, &remote_addr) == 0) { /* Ensure this address is of the right type and is a valid * destination. */ if (remote_addr->svm_cid == VMADDR_CID_ANY) remote_addr->svm_cid = transport->get_local_cid(); if (!vsock_addr_bound(remote_addr)) { err = -EINVAL; goto out; } } else if (sock->state == SS_CONNECTED) { remote_addr = &vsk->remote_addr; if (remote_addr->svm_cid == VMADDR_CID_ANY) remote_addr->svm_cid = transport->get_local_cid(); /* XXX Should connect() or this function ensure remote_addr is * bound? */ if (!vsock_addr_bound(&vsk->remote_addr)) { err = -EINVAL; goto out; } } else { err = -EINVAL; goto out; } if (!transport->dgram_allow(remote_addr->svm_cid, remote_addr->svm_port)) { err = -EINVAL; goto out; } err = transport->dgram_enqueue(vsk, remote_addr, msg, len); out: release_sock(sk); return err; } static int vsock_dgram_connect(struct socket *sock, struct sockaddr *addr, int addr_len, int flags) { int err; struct sock *sk; struct vsock_sock *vsk; struct sockaddr_vm *remote_addr; sk = sock->sk; vsk = vsock_sk(sk); err = vsock_addr_cast(addr, addr_len, &remote_addr); if (err == -EAFNOSUPPORT && remote_addr->svm_family == AF_UNSPEC) { lock_sock(sk); vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); sock->state = SS_UNCONNECTED; release_sock(sk); return 0; } else if (err != 0) return -EINVAL; lock_sock(sk); err = vsock_auto_bind(vsk); if (err) goto out; if (!vsk->transport->dgram_allow(remote_addr->svm_cid, remote_addr->svm_port)) { err = -EINVAL; goto out; } memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr)); sock->state = SS_CONNECTED; /* sock map disallows redirection of non-TCP sockets with sk_state != * TCP_ESTABLISHED (see sock_map_redirect_allowed()), so we set * TCP_ESTABLISHED here to allow redirection of connected vsock dgrams. * * This doesn't seem to be abnormal state for datagram sockets, as the * same approach can be see in other datagram socket types as well * (such as unix sockets). */ sk->sk_state = TCP_ESTABLISHED; out: release_sock(sk); return err; } int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { #ifdef CONFIG_BPF_SYSCALL const struct proto *prot; #endif struct vsock_sock *vsk; struct sock *sk; sk = sock->sk; vsk = vsock_sk(sk); #ifdef CONFIG_BPF_SYSCALL prot = READ_ONCE(sk->sk_prot); if (prot != &vsock_proto) return prot->recvmsg(sk, msg, len, flags, NULL); #endif return vsk->transport->dgram_dequeue(vsk, msg, len, flags); } EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg); static const struct proto_ops vsock_dgram_ops = { .family = PF_VSOCK, .owner = THIS_MODULE, .release = vsock_release, .bind = vsock_bind, .connect = vsock_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = vsock_getname, .poll = vsock_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = vsock_shutdown, .sendmsg = vsock_dgram_sendmsg, .recvmsg = vsock_dgram_recvmsg, .mmap = sock_no_mmap, .read_skb = vsock_read_skb, }; static int vsock_transport_cancel_pkt(struct vsock_sock *vsk) { const struct vsock_transport *transport = vsk->transport; if (!transport || !transport->cancel_pkt) return -EOPNOTSUPP; return transport->cancel_pkt(vsk); } static void vsock_connect_timeout(struct work_struct *work) { struct sock *sk; struct vsock_sock *vsk; vsk = container_of(work, struct vsock_sock, connect_work.work); sk = sk_vsock(vsk); lock_sock(sk); if (sk->sk_state == TCP_SYN_SENT && (sk->sk_shutdown != SHUTDOWN_MASK)) { sk->sk_state = TCP_CLOSE; sk->sk_socket->state = SS_UNCONNECTED; sk->sk_err = ETIMEDOUT; sk_error_report(sk); vsock_transport_cancel_pkt(vsk); } release_sock(sk); sock_put(sk); } static int vsock_connect(struct socket *sock, struct sockaddr *addr, int addr_len, int flags) { int err; struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; struct sockaddr_vm *remote_addr; long timeout; DEFINE_WAIT(wait); err = 0; sk = sock->sk; vsk = vsock_sk(sk); lock_sock(sk); /* XXX AF_UNSPEC should make us disconnect like AF_INET. */ switch (sock->state) { case SS_CONNECTED: err = -EISCONN; goto out; case SS_DISCONNECTING: err = -EINVAL; goto out; case SS_CONNECTING: /* This continues on so we can move sock into the SS_CONNECTED * state once the connection has completed (at which point err * will be set to zero also). Otherwise, we will either wait * for the connection or return -EALREADY should this be a * non-blocking call. */ err = -EALREADY; if (flags & O_NONBLOCK) goto out; break; default: if ((sk->sk_state == TCP_LISTEN) || vsock_addr_cast(addr, addr_len, &remote_addr) != 0) { err = -EINVAL; goto out; } /* Set the remote address that we are connecting to. */ memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr)); err = vsock_assign_transport(vsk, NULL); if (err) goto out; transport = vsk->transport; /* The hypervisor and well-known contexts do not have socket * endpoints. */ if (!transport || !transport->stream_allow(remote_addr->svm_cid, remote_addr->svm_port)) { err = -ENETUNREACH; goto out; } if (vsock_msgzerocopy_allow(transport)) { set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); } else if (sock_flag(sk, SOCK_ZEROCOPY)) { /* If this option was set before 'connect()', * when transport was unknown, check that this * feature is supported here. */ err = -EOPNOTSUPP; goto out; } err = vsock_auto_bind(vsk); if (err) goto out; sk->sk_state = TCP_SYN_SENT; err = transport->connect(vsk); if (err < 0) goto out; /* Mark sock as connecting and set the error code to in * progress in case this is a non-blocking connect. */ sock->state = SS_CONNECTING; err = -EINPROGRESS; } /* The receive path will handle all communication until we are able to * enter the connected state. Here we wait for the connection to be * completed or a notification of an error. */ timeout = vsk->connect_timeout; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); while (sk->sk_state != TCP_ESTABLISHED && sk->sk_err == 0) { if (flags & O_NONBLOCK) { /* If we're not going to block, we schedule a timeout * function to generate a timeout on the connection * attempt, in case the peer doesn't respond in a * timely manner. We hold on to the socket until the * timeout fires. */ sock_hold(sk); /* If the timeout function is already scheduled, * reschedule it, then ungrab the socket refcount to * keep it balanced. */ if (mod_delayed_work(system_wq, &vsk->connect_work, timeout)) sock_put(sk); /* Skip ahead to preserve error code set above. */ goto out_wait; } release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); sk->sk_state = sk->sk_state == TCP_ESTABLISHED ? TCP_CLOSING : TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); vsock_remove_connected(vsk); goto out_wait; } else if ((sk->sk_state != TCP_ESTABLISHED) && (timeout == 0)) { err = -ETIMEDOUT; sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); goto out_wait; } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } if (sk->sk_err) { err = -sk->sk_err; sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; } else { err = 0; } out_wait: finish_wait(sk_sleep(sk), &wait); out: release_sock(sk); return err; } static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *listener; int err; struct sock *connected; struct vsock_sock *vconnected; long timeout; DEFINE_WAIT(wait); err = 0; listener = sock->sk; lock_sock(listener); if (!sock_type_connectible(sock->type)) { err = -EOPNOTSUPP; goto out; } if (listener->sk_state != TCP_LISTEN) { err = -EINVAL; goto out; } /* Wait for children sockets to appear; these are the new sockets * created upon connection establishment. */ timeout = sock_rcvtimeo(listener, flags & O_NONBLOCK); prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); while ((connected = vsock_dequeue_accept(listener)) == NULL && listener->sk_err == 0) { release_sock(listener); timeout = schedule_timeout(timeout); finish_wait(sk_sleep(listener), &wait); lock_sock(listener); if (signal_pending(current)) { err = sock_intr_errno(timeout); goto out; } else if (timeout == 0) { err = -EAGAIN; goto out; } prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); } finish_wait(sk_sleep(listener), &wait); if (listener->sk_err) err = -listener->sk_err; if (connected) { sk_acceptq_removed(listener); lock_sock_nested(connected, SINGLE_DEPTH_NESTING); vconnected = vsock_sk(connected); /* If the listener socket has received an error, then we should * reject this socket and return. Note that we simply mark the * socket rejected, drop our reference, and let the cleanup * function handle the cleanup; the fact that we found it in * the listener's accept queue guarantees that the cleanup * function hasn't run yet. */ if (err) { vconnected->rejected = true; } else { newsock->state = SS_CONNECTED; sock_graft(connected, newsock); if (vsock_msgzerocopy_allow(vconnected->transport)) set_bit(SOCK_SUPPORT_ZC, &connected->sk_socket->flags); } release_sock(connected); sock_put(connected); } out: release_sock(listener); return err; } static int vsock_listen(struct socket *sock, int backlog) { int err; struct sock *sk; struct vsock_sock *vsk; sk = sock->sk; lock_sock(sk); if (!sock_type_connectible(sk->sk_type)) { err = -EOPNOTSUPP; goto out; } if (sock->state != SS_UNCONNECTED) { err = -EINVAL; goto out; } vsk = vsock_sk(sk); if (!vsock_addr_bound(&vsk->local_addr)) { err = -EINVAL; goto out; } sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; err = 0; out: release_sock(sk); return err; } static void vsock_update_buffer_size(struct vsock_sock *vsk, const struct vsock_transport *transport, u64 val) { if (val > vsk->buffer_max_size) val = vsk->buffer_max_size; if (val < vsk->buffer_min_size) val = vsk->buffer_min_size; if (val != vsk->buffer_size && transport && transport->notify_buffer_size) transport->notify_buffer_size(vsk, &val); vsk->buffer_size = val; } static int vsock_connectible_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { int err; struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; u64 val; if (level != AF_VSOCK && level != SOL_SOCKET) return -ENOPROTOOPT; #define COPY_IN(_v) \ do { \ if (optlen < sizeof(_v)) { \ err = -EINVAL; \ goto exit; \ } \ if (copy_from_sockptr(&_v, optval, sizeof(_v)) != 0) { \ err = -EFAULT; \ goto exit; \ } \ } while (0) err = 0; sk = sock->sk; vsk = vsock_sk(sk); lock_sock(sk); transport = vsk->transport; if (level == SOL_SOCKET) { int zerocopy; if (optname != SO_ZEROCOPY) { release_sock(sk); return sock_setsockopt(sock, level, optname, optval, optlen); } /* Use 'int' type here, because variable to * set this option usually has this type. */ COPY_IN(zerocopy); if (zerocopy < 0 || zerocopy > 1) { err = -EINVAL; goto exit; } if (transport && !vsock_msgzerocopy_allow(transport)) { err = -EOPNOTSUPP; goto exit; } sock_valbool_flag(sk, SOCK_ZEROCOPY, zerocopy); goto exit; } switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: COPY_IN(val); vsock_update_buffer_size(vsk, transport, val); break; case SO_VM_SOCKETS_BUFFER_MAX_SIZE: COPY_IN(val); vsk->buffer_max_size = val; vsock_update_buffer_size(vsk, transport, vsk->buffer_size); break; case SO_VM_SOCKETS_BUFFER_MIN_SIZE: COPY_IN(val); vsk->buffer_min_size = val; vsock_update_buffer_size(vsk, transport, vsk->buffer_size); break; case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW: case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD: { struct __kernel_sock_timeval tv; err = sock_copy_user_timeval(&tv, optval, optlen, optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD); if (err) break; if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC && tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) { vsk->connect_timeout = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, (USEC_PER_SEC / HZ)); if (vsk->connect_timeout == 0) vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; } else { err = -ERANGE; } break; } default: err = -ENOPROTOOPT; break; } #undef COPY_IN exit: release_sock(sk); return err; } static int vsock_connectible_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct vsock_sock *vsk = vsock_sk(sk); union { u64 val64; struct old_timeval32 tm32; struct __kernel_old_timeval tm; struct __kernel_sock_timeval stm; } v; int lv = sizeof(v.val64); int len; if (level != AF_VSOCK) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; memset(&v, 0, sizeof(v)); switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: v.val64 = vsk->buffer_size; break; case SO_VM_SOCKETS_BUFFER_MAX_SIZE: v.val64 = vsk->buffer_max_size; break; case SO_VM_SOCKETS_BUFFER_MIN_SIZE: v.val64 = vsk->buffer_min_size; break; case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW: case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD: lv = sock_get_timeout(vsk->connect_timeout, &v, optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD); break; default: return -ENOPROTOOPT; } if (len < lv) return -EINVAL; if (len > lv) len = lv; if (copy_to_user(optval, &v, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; ssize_t total_written; long timeout; int err; struct vsock_transport_send_notify_data send_data; DEFINE_WAIT_FUNC(wait, woken_wake_function); sk = sock->sk; vsk = vsock_sk(sk); total_written = 0; err = 0; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; lock_sock(sk); transport = vsk->transport; /* Callers should not provide a destination with connection oriented * sockets. */ if (msg->msg_namelen) { err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out; } /* Send data only if both sides are not shutdown in the direction. */ if (sk->sk_shutdown & SEND_SHUTDOWN || vsk->peer_shutdown & RCV_SHUTDOWN) { err = -EPIPE; goto out; } if (!transport || sk->sk_state != TCP_ESTABLISHED || !vsock_addr_bound(&vsk->local_addr)) { err = -ENOTCONN; goto out; } if (!vsock_addr_bound(&vsk->remote_addr)) { err = -EDESTADDRREQ; goto out; } if (msg->msg_flags & MSG_ZEROCOPY && !vsock_msgzerocopy_allow(transport)) { err = -EOPNOTSUPP; goto out; } /* Wait for room in the produce queue to enqueue our user's data. */ timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); err = transport->notify_send_init(vsk, &send_data); if (err < 0) goto out; while (total_written < len) { ssize_t written; add_wait_queue(sk_sleep(sk), &wait); while (vsock_stream_has_space(vsk) == 0 && sk->sk_err == 0 && !(sk->sk_shutdown & SEND_SHUTDOWN) && !(vsk->peer_shutdown & RCV_SHUTDOWN)) { /* Don't wait for non-blocking sockets. */ if (timeout == 0) { err = -EAGAIN; remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } err = transport->notify_send_pre_block(vsk, &send_data); if (err < 0) { remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } release_sock(sk); timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } else if (timeout == 0) { err = -EAGAIN; remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } } remove_wait_queue(sk_sleep(sk), &wait); /* These checks occur both as part of and after the loop * conditional since we need to check before and after * sleeping. */ if (sk->sk_err) { err = -sk->sk_err; goto out_err; } else if ((sk->sk_shutdown & SEND_SHUTDOWN) || (vsk->peer_shutdown & RCV_SHUTDOWN)) { err = -EPIPE; goto out_err; } err = transport->notify_send_pre_enqueue(vsk, &send_data); if (err < 0) goto out_err; /* Note that enqueue will only write as many bytes as are free * in the produce queue, so we don't need to ensure len is * smaller than the queue size. It is the caller's * responsibility to check how many bytes we were able to send. */ if (sk->sk_type == SOCK_SEQPACKET) { written = transport->seqpacket_enqueue(vsk, msg, len - total_written); } else { written = transport->stream_enqueue(vsk, msg, len - total_written); } if (written < 0) { err = written; goto out_err; } total_written += written; err = transport->notify_send_post_enqueue( vsk, written, &send_data); if (err < 0) goto out_err; } out_err: if (total_written > 0) { /* Return number of written bytes only if: * 1) SOCK_STREAM socket. * 2) SOCK_SEQPACKET socket when whole buffer is sent. */ if (sk->sk_type == SOCK_STREAM || total_written == len) err = total_written; } out: if (sk->sk_type == SOCK_STREAM) err = sk_stream_error(sk, msg->msg_flags, err); release_sock(sk); return err; } static int vsock_connectible_wait_data(struct sock *sk, struct wait_queue_entry *wait, long timeout, struct vsock_transport_recv_notify_data *recv_data, size_t target) { const struct vsock_transport *transport; struct vsock_sock *vsk; s64 data; int err; vsk = vsock_sk(sk); err = 0; transport = vsk->transport; while (1) { prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE); data = vsock_connectible_has_data(vsk); if (data != 0) break; if (sk->sk_err != 0 || (sk->sk_shutdown & RCV_SHUTDOWN) || (vsk->peer_shutdown & SEND_SHUTDOWN)) { break; } /* Don't wait for non-blocking sockets. */ if (timeout == 0) { err = -EAGAIN; break; } if (recv_data) { err = transport->notify_recv_pre_block(vsk, target, recv_data); if (err < 0) break; } release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); break; } else if (timeout == 0) { err = -EAGAIN; break; } } finish_wait(sk_sleep(sk), wait); if (err) return err; /* Internal transport error when checking for available * data. XXX This should be changed to a connection * reset in a later change. */ if (data < 0) return -ENOMEM; return data; } static int __vsock_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { struct vsock_transport_recv_notify_data recv_data; const struct vsock_transport *transport; struct vsock_sock *vsk; ssize_t copied; size_t target; long timeout; int err; DEFINE_WAIT(wait); vsk = vsock_sk(sk); transport = vsk->transport; /* We must not copy less than target bytes into the user's buffer * before returning successfully, so we wait for the consume queue to * have that much data to consume before dequeueing. Note that this * makes it impossible to handle cases where target is greater than the * queue size. */ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); if (target >= transport->stream_rcvhiwat(vsk)) { err = -ENOMEM; goto out; } timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); copied = 0; err = transport->notify_recv_init(vsk, target, &recv_data); if (err < 0) goto out; while (1) { ssize_t read; err = vsock_connectible_wait_data(sk, &wait, timeout, &recv_data, target); if (err <= 0) break; err = transport->notify_recv_pre_dequeue(vsk, target, &recv_data); if (err < 0) break; read = transport->stream_dequeue(vsk, msg, len - copied, flags); if (read < 0) { err = read; break; } copied += read; err = transport->notify_recv_post_dequeue(vsk, target, read, !(flags & MSG_PEEK), &recv_data); if (err < 0) goto out; if (read >= target || flags & MSG_PEEK) break; target -= read; } if (sk->sk_err) err = -sk->sk_err; else if (sk->sk_shutdown & RCV_SHUTDOWN) err = 0; if (copied > 0) err = copied; out: return err; } static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { const struct vsock_transport *transport; struct vsock_sock *vsk; ssize_t msg_len; long timeout; int err = 0; DEFINE_WAIT(wait); vsk = vsock_sk(sk); transport = vsk->transport; timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); err = vsock_connectible_wait_data(sk, &wait, timeout, NULL, 0); if (err <= 0) goto out; msg_len = transport->seqpacket_dequeue(vsk, msg, flags); if (msg_len < 0) { err = msg_len; goto out; } if (sk->sk_err) { err = -sk->sk_err; } else if (sk->sk_shutdown & RCV_SHUTDOWN) { err = 0; } else { /* User sets MSG_TRUNC, so return real length of * packet. */ if (flags & MSG_TRUNC) err = msg_len; else err = len - msg_data_left(msg); /* Always set MSG_TRUNC if real length of packet is * bigger than user's buffer. */ if (msg_len > len) msg->msg_flags |= MSG_TRUNC; } out: return err; } int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; #ifdef CONFIG_BPF_SYSCALL const struct proto *prot; #endif int err; sk = sock->sk; if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, VSOCK_RECVERR); vsk = vsock_sk(sk); err = 0; lock_sock(sk); transport = vsk->transport; if (!transport || sk->sk_state != TCP_ESTABLISHED) { /* Recvmsg is supposed to return 0 if a peer performs an * orderly shutdown. Differentiate between that case and when a * peer has not connected or a local shutdown occurred with the * SOCK_DONE flag. */ if (sock_flag(sk, SOCK_DONE)) err = 0; else err = -ENOTCONN; goto out; } if (flags & MSG_OOB) { err = -EOPNOTSUPP; goto out; } /* We don't check peer_shutdown flag here since peer may actually shut * down, but there can be data in the queue that a local socket can * receive. */ if (sk->sk_shutdown & RCV_SHUTDOWN) { err = 0; goto out; } /* It is valid on Linux to pass in a zero-length receive buffer. This * is not an error. We may as well bail out now. */ if (!len) { err = 0; goto out; } #ifdef CONFIG_BPF_SYSCALL prot = READ_ONCE(sk->sk_prot); if (prot != &vsock_proto) { release_sock(sk); return prot->recvmsg(sk, msg, len, flags, NULL); } #endif if (sk->sk_type == SOCK_STREAM) err = __vsock_stream_recvmsg(sk, msg, len, flags); else err = __vsock_seqpacket_recvmsg(sk, msg, len, flags); out: release_sock(sk); return err; } EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg); static int vsock_set_rcvlowat(struct sock *sk, int val) { const struct vsock_transport *transport; struct vsock_sock *vsk; vsk = vsock_sk(sk); if (val > vsk->buffer_size) return -EINVAL; transport = vsk->transport; if (transport && transport->notify_set_rcvlowat) { int err; err = transport->notify_set_rcvlowat(vsk, val); if (err) return err; } WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); return 0; } static const struct proto_ops vsock_stream_ops = { .family = PF_VSOCK, .owner = THIS_MODULE, .release = vsock_release, .bind = vsock_bind, .connect = vsock_connect, .socketpair = sock_no_socketpair, .accept = vsock_accept, .getname = vsock_getname, .poll = vsock_poll, .ioctl = sock_no_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, .setsockopt = vsock_connectible_setsockopt, .getsockopt = vsock_connectible_getsockopt, .sendmsg = vsock_connectible_sendmsg, .recvmsg = vsock_connectible_recvmsg, .mmap = sock_no_mmap, .set_rcvlowat = vsock_set_rcvlowat, .read_skb = vsock_read_skb, }; static const struct proto_ops vsock_seqpacket_ops = { .family = PF_VSOCK, .owner = THIS_MODULE, .release = vsock_release, .bind = vsock_bind, .connect = vsock_connect, .socketpair = sock_no_socketpair, .accept = vsock_accept, .getname = vsock_getname, .poll = vsock_poll, .ioctl = sock_no_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, .setsockopt = vsock_connectible_setsockopt, .getsockopt = vsock_connectible_getsockopt, .sendmsg = vsock_connectible_sendmsg, .recvmsg = vsock_connectible_recvmsg, .mmap = sock_no_mmap, .read_skb = vsock_read_skb, }; static int vsock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct vsock_sock *vsk; struct sock *sk; int ret; if (!sock) return -EINVAL; if (protocol && protocol != PF_VSOCK) return -EPROTONOSUPPORT; switch (sock->type) { case SOCK_DGRAM: sock->ops = &vsock_dgram_ops; break; case SOCK_STREAM: sock->ops = &vsock_stream_ops; break; case SOCK_SEQPACKET: sock->ops = &vsock_seqpacket_ops; break; default: return -ESOCKTNOSUPPORT; } sock->state = SS_UNCONNECTED; sk = __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern); if (!sk) return -ENOMEM; vsk = vsock_sk(sk); if (sock->type == SOCK_DGRAM) { ret = vsock_assign_transport(vsk, NULL); if (ret < 0) { sock_put(sk); return ret; } } /* SOCK_DGRAM doesn't have 'setsockopt' callback set in its * proto_ops, so there is no handler for custom logic. */ if (sock_type_connectible(sock->type)) set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); vsock_insert_unbound(vsk); return 0; } static const struct net_proto_family vsock_family_ops = { .family = AF_VSOCK, .create = vsock_create, .owner = THIS_MODULE, }; static long vsock_dev_do_ioctl(struct file *filp, unsigned int cmd, void __user *ptr) { u32 __user *p = ptr; u32 cid = VMADDR_CID_ANY; int retval = 0; switch (cmd) { case IOCTL_VM_SOCKETS_GET_LOCAL_CID: /* To be compatible with the VMCI behavior, we prioritize the * guest CID instead of well-know host CID (VMADDR_CID_HOST). */ if (transport_g2h) cid = transport_g2h->get_local_cid(); else if (transport_h2g) cid = transport_h2g->get_local_cid(); if (put_user(cid, p) != 0) retval = -EFAULT; break; default: retval = -ENOIOCTLCMD; } return retval; } static long vsock_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { return vsock_dev_do_ioctl(filp, cmd, (void __user *)arg); } #ifdef CONFIG_COMPAT static long vsock_dev_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { return vsock_dev_do_ioctl(filp, cmd, compat_ptr(arg)); } #endif static const struct file_operations vsock_device_ops = { .owner = THIS_MODULE, .unlocked_ioctl = vsock_dev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = vsock_dev_compat_ioctl, #endif .open = nonseekable_open, }; static struct miscdevice vsock_device = { .name = "vsock", .fops = &vsock_device_ops, }; static int __init vsock_init(void) { int err = 0; vsock_init_tables(); vsock_proto.owner = THIS_MODULE; vsock_device.minor = MISC_DYNAMIC_MINOR; err = misc_register(&vsock_device); if (err) { pr_err("Failed to register misc device\n"); goto err_reset_transport; } err = proto_register(&vsock_proto, 1); /* we want our slab */ if (err) { pr_err("Cannot register vsock protocol\n"); goto err_deregister_misc; } err = sock_register(&vsock_family_ops); if (err) { pr_err("could not register af_vsock (%d) address family: %d\n", AF_VSOCK, err); goto err_unregister_proto; } vsock_bpf_build_proto(); return 0; err_unregister_proto: proto_unregister(&vsock_proto); err_deregister_misc: misc_deregister(&vsock_device); err_reset_transport: return err; } static void __exit vsock_exit(void) { misc_deregister(&vsock_device); sock_unregister(AF_VSOCK); proto_unregister(&vsock_proto); } const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk) { return vsk->transport; } EXPORT_SYMBOL_GPL(vsock_core_get_transport); int vsock_core_register(const struct vsock_transport *t, int features) { const struct vsock_transport *t_h2g, *t_g2h, *t_dgram, *t_local; int err = mutex_lock_interruptible(&vsock_register_mutex); if (err) return err; t_h2g = transport_h2g; t_g2h = transport_g2h; t_dgram = transport_dgram; t_local = transport_local; if (features & VSOCK_TRANSPORT_F_H2G) { if (t_h2g) { err = -EBUSY; goto err_busy; } t_h2g = t; } if (features & VSOCK_TRANSPORT_F_G2H) { if (t_g2h) { err = -EBUSY; goto err_busy; } t_g2h = t; } if (features & VSOCK_TRANSPORT_F_DGRAM) { if (t_dgram) { err = -EBUSY; goto err_busy; } t_dgram = t; } if (features & VSOCK_TRANSPORT_F_LOCAL) { if (t_local) { err = -EBUSY; goto err_busy; } t_local = t; } transport_h2g = t_h2g; transport_g2h = t_g2h; transport_dgram = t_dgram; transport_local = t_local; err_busy: mutex_unlock(&vsock_register_mutex); return err; } EXPORT_SYMBOL_GPL(vsock_core_register); void vsock_core_unregister(const struct vsock_transport *t) { mutex_lock(&vsock_register_mutex); if (transport_h2g == t) transport_h2g = NULL; if (transport_g2h == t) transport_g2h = NULL; if (transport_dgram == t) transport_dgram = NULL; if (transport_local == t) transport_local = NULL; mutex_unlock(&vsock_register_mutex); } EXPORT_SYMBOL_GPL(vsock_core_unregister); module_init(vsock_init); module_exit(vsock_exit); MODULE_AUTHOR("VMware, Inc."); MODULE_DESCRIPTION("VMware Virtual Socket Family"); MODULE_VERSION("1.0.2.0-k"); MODULE_LICENSE("GPL v2"); |
43033 43021 140 14657 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 | // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/init.h> #include <linux/export.h> #include <linux/timer.h> #include <linux/acpi_pmtmr.h> #include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/clocksource.h> #include <linux/percpu.h> #include <linux/timex.h> #include <linux/static_key.h> #include <linux/static_call.h> #include <asm/hpet.h> #include <asm/timer.h> #include <asm/vgtod.h> #include <asm/time.h> #include <asm/delay.h> #include <asm/hypervisor.h> #include <asm/nmi.h> #include <asm/x86_init.h> #include <asm/geode.h> #include <asm/apic.h> #include <asm/cpu_device_id.h> #include <asm/i8259.h> #include <asm/uv/uv.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); unsigned int __read_mostly tsc_khz; EXPORT_SYMBOL(tsc_khz); #define KHZ 1000 /* * TSC can be unstable due to cpufreq or due to unsynced TSCs */ static int __read_mostly tsc_unstable; static unsigned int __initdata tsc_early_khz; static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc); int tsc_clocksource_reliable; static int __read_mostly tsc_force_recalibrate; static u32 art_to_tsc_numerator; static u32 art_to_tsc_denominator; static u64 art_to_tsc_offset; static bool have_art; struct cyc2ns { struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ seqcount_latch_t seq; /* 32 + 4 = 36 */ }; /* fits one cacheline */ static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); static int __init tsc_early_khz_setup(char *buf) { return kstrtouint(buf, 0, &tsc_early_khz); } early_param("tsc_early_khz", tsc_early_khz_setup); __always_inline void __cyc2ns_read(struct cyc2ns_data *data) { int seq, idx; do { seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); idx = seq & 1; data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); } __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) { preempt_disable_notrace(); __cyc2ns_read(data); } __always_inline void cyc2ns_read_end(void) { preempt_enable_notrace(); } /* * Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: * ns = cycles / (freq / ns_per_sec) * ns = cycles * (ns_per_sec / freq) * ns = cycles * (10^9 / (cpu_khz * 10^3)) * ns = cycles * (10^6 / cpu_khz) * * Then we use scaling math (suggested by george@mvista.com) to get: * ns = cycles * (10^6 * SC / cpu_khz) / SC * ns = cycles * cyc2ns_scale / SC * * And since SC is a constant power of two, we can convert the div * into a shift. The larger SC is, the more accurate the conversion, but * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication * (64-bit result) can be used. * * We can use khz divisor instead of mhz to keep a better precision. * (mathieu.desnoyers@polymtl.ca) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; unsigned long long ns; __cyc2ns_read(&data); ns = data.cyc2ns_offset; ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); return ns; } static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) { unsigned long long ns; preempt_disable_notrace(); ns = __cycles_2_ns(cyc); preempt_enable_notrace(); return ns; } static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long long ns_now; struct cyc2ns_data data; struct cyc2ns *c2n; ns_now = cycles_2_ns(tsc_now); /* * Compute a new multiplier as per the above comment and ensure our * time function is continuous; see the comment near struct * cyc2ns_data. */ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz, NSEC_PER_MSEC, 0); /* * cyc2ns_shift is exported via arch_perf_update_userpage() where it is * not expected to be greater than 31 due to the original published * conversion algorithm shifting a 32-bit value (now specifies a 64-bit * value) - refer perf_event_mmap_page documentation in perf_event.h. */ if (data.cyc2ns_shift == 32) { data.cyc2ns_shift = 31; data.cyc2ns_mul >>= 1; } data.cyc2ns_offset = ns_now - mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift); c2n = per_cpu_ptr(&cyc2ns, cpu); raw_write_seqcount_latch(&c2n->seq); c2n->data[0] = data; raw_write_seqcount_latch(&c2n->seq); c2n->data[1] = data; } static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long flags; local_irq_save(flags); sched_clock_idle_sleep_event(); if (khz) __set_cyc2ns_scale(khz, cpu, tsc_now); sched_clock_idle_wakeup_event(); local_irq_restore(flags); } /* * Initialize cyc2ns for boot cpu */ static void __init cyc2ns_init_boot_cpu(void) { struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); seqcount_latch_init(&c2n->seq); __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); } /* * Secondary CPUs do not run through tsc_init(), so set up * all the scale factors for all CPUs, assuming the same * speed as the bootup CPU. */ static void __init cyc2ns_init_secondary_cpus(void) { unsigned int cpu, this_cpu = smp_processor_id(); struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); struct cyc2ns_data *data = c2n->data; for_each_possible_cpu(cpu) { if (cpu != this_cpu) { seqcount_latch_init(&c2n->seq); c2n = per_cpu_ptr(&cyc2ns, cpu); c2n->data[0] = data[0]; c2n->data[1] = data[1]; } } } /* * Scheduler clock - returns current time in nanosec units. */ noinstr u64 native_sched_clock(void) { if (static_branch_likely(&__use_tsc)) { u64 tsc_now = rdtsc(); /* return the value in ns */ return __cycles_2_ns(tsc_now); } /* * Fall back to jiffies if there's no TSC available: * ( But note that we still use it if the TSC is marked * unstable. We do this because unlike Time Of Day, * the scheduler clock tolerates small errors and it's * very important for it to be as fast as the platform * can achieve it. ) */ /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); } /* * Generate a sched_clock if you already have a TSC value. */ u64 native_sched_clock_from_tsc(u64 tsc) { return cycles_2_ns(tsc); } /* We need to define a real function for sched_clock, to override the weak default version */ #ifdef CONFIG_PARAVIRT noinstr u64 sched_clock_noinstr(void) { return paravirt_sched_clock(); } bool using_native_sched_clock(void) { return static_call_query(pv_sched_clock) == native_sched_clock; } #else u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock"))); bool using_native_sched_clock(void) { return true; } #endif notrace u64 sched_clock(void) { u64 now; preempt_disable_notrace(); now = sched_clock_noinstr(); preempt_enable_notrace(); return now; } int check_tsc_unstable(void) { return tsc_unstable; } EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { mark_tsc_unstable("boot parameter notsc"); return 1; } #else /* * disable flag for tsc. Takes effect by clearing the TSC cpu flag * in cpu/common.c */ int __init notsc_setup(char *str) { setup_clear_cpu_cap(X86_FEATURE_TSC); return 1; } #endif __setup("notsc", notsc_setup); static int no_sched_irq_time; static int no_tsc_watchdog; static int tsc_as_watchdog; static int __init tsc_setup(char *str) { if (!strcmp(str, "reliable")) tsc_clocksource_reliable = 1; if (!strncmp(str, "noirqtime", 9)) no_sched_irq_time = 1; if (!strcmp(str, "unstable")) mark_tsc_unstable("boot parameter"); if (!strcmp(str, "nowatchdog")) { no_tsc_watchdog = 1; if (tsc_as_watchdog) pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n", __func__); tsc_as_watchdog = 0; } if (!strcmp(str, "recalibrate")) tsc_force_recalibrate = 1; if (!strcmp(str, "watchdog")) { if (no_tsc_watchdog) pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n", __func__); else tsc_as_watchdog = 1; } return 1; } __setup("tsc=", tsc_setup); #define MAX_RETRIES 5 #define TSC_DEFAULT_THRESHOLD 0x20000 /* * Read TSC and the reference counters. Take care of any disturbances */ static u64 tsc_read_refs(u64 *p, int hpet) { u64 t1, t2; u64 thresh = tsc_khz ? tsc_khz >> 5 : TSC_DEFAULT_THRESHOLD; int i; for (i = 0; i < MAX_RETRIES; i++) { t1 = get_cycles(); if (hpet) *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; else *p = acpi_pm_read_early(); t2 = get_cycles(); if ((t2 - t1) < thresh) return t2; } return ULLONG_MAX; } /* * Calculate the TSC frequency from HPET reference */ static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2) { u64 tmp; if (hpet2 < hpet1) hpet2 += 0x100000000ULL; hpet2 -= hpet1; tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); do_div(tmp, 1000000); deltatsc = div64_u64(deltatsc, tmp); return (unsigned long) deltatsc; } /* * Calculate the TSC frequency from PMTimer reference */ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) { u64 tmp; if (!pm1 && !pm2) return ULONG_MAX; if (pm2 < pm1) pm2 += (u64)ACPI_PM_OVRRUN; pm2 -= pm1; tmp = pm2 * 1000000000LL; do_div(tmp, PMTMR_TICKS_PER_SEC); do_div(deltatsc, tmp); return (unsigned long) deltatsc; } #define CAL_MS 10 #define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS)) #define CAL_PIT_LOOPS 1000 #define CAL2_MS 50 #define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS)) #define CAL2_PIT_LOOPS 5000 /* * Try to calibrate the TSC against the Programmable * Interrupt Timer and return the frequency of the TSC * in kHz. * * Return ULONG_MAX on failure to calibrate. */ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) { u64 tsc, t1, t2, delta; unsigned long tscmin, tscmax; int pitcnt; if (!has_legacy_pic()) { /* * Relies on tsc_early_delay_calibrate() to have given us semi * usable udelay(), wait for the same 50ms we would have with * the PIT loop below. */ udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); return ULONG_MAX; } /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); /* * Setup CTC channel 2* for mode 0, (interrupt on terminal * count mode), binary count. Set the latch register to 50ms * (LSB then MSB) to begin countdown. */ outb(0xb0, 0x43); outb(latch & 0xff, 0x42); outb(latch >> 8, 0x42); tsc = t1 = t2 = get_cycles(); pitcnt = 0; tscmax = 0; tscmin = ULONG_MAX; while ((inb(0x61) & 0x20) == 0) { t2 = get_cycles(); delta = t2 - tsc; tsc = t2; if ((unsigned long) delta < tscmin) tscmin = (unsigned int) delta; if ((unsigned long) delta > tscmax) tscmax = (unsigned int) delta; pitcnt++; } /* * Sanity checks: * * If we were not able to read the PIT more than loopmin * times, then we have been hit by a massive SMI * * If the maximum is 10 times larger than the minimum, * then we got hit by an SMI as well. */ if (pitcnt < loopmin || tscmax > 10 * tscmin) return ULONG_MAX; /* Calculate the PIT value */ delta = t2 - t1; do_div(delta, ms); return delta; } /* * This reads the current MSB of the PIT counter, and * checks if we are running on sufficiently fast and * non-virtualized hardware. * * Our expectations are: * * - the PIT is running at roughly 1.19MHz * * - each IO is going to take about 1us on real hardware, * but we allow it to be much faster (by a factor of 10) or * _slightly_ slower (ie we allow up to a 2us read+counter * update - anything else implies a unacceptably slow CPU * or PIT for the fast calibration to work. * * - with 256 PIT ticks to read the value, we have 214us to * see the same MSB (and overhead like doing a single TSC * read per MSB value etc). * * - We're doing 2 reads per loop (LSB, MSB), and we expect * them each to take about a microsecond on real hardware. * So we expect a count value of around 100. But we'll be * generous, and accept anything over 50. * * - if the PIT is stuck, and we see *many* more reads, we * return early (and the next caller of pit_expect_msb() * then consider it a failure when they don't see the * next expected value). * * These expectations mean that we know that we have seen the * transition from one expected value to another with a fairly * high accuracy, and we didn't miss any events. We can thus * use the TSC value at the transitions to calculate a pretty * good value for the TSC frequency. */ static inline int pit_verify_msb(unsigned char val) { /* Ignore LSB */ inb(0x42); return inb(0x42) == val; } static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { int count; u64 tsc = 0, prev_tsc = 0; for (count = 0; count < 50000; count++) { if (!pit_verify_msb(val)) break; prev_tsc = tsc; tsc = get_cycles(); } *deltap = get_cycles() - prev_tsc; *tscp = tsc; /* * We require _some_ success, but the quality control * will be based on the error terms on the TSC values. */ return count > 5; } /* * How many MSB values do we want to see? We aim for * a maximum error rate of 500ppm (in practice the * real error is much smaller), but refuse to spend * more than 50ms on it. */ #define MAX_QUICK_PIT_MS 50 #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) static unsigned long quick_pit_calibrate(void) { int i; u64 tsc, delta; unsigned long d1, d2; if (!has_legacy_pic()) return 0; /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); /* * Counter 2, mode 0 (one-shot), binary count * * NOTE! Mode 2 decrements by two (and then the * output is flipped each time, giving the same * final output frequency as a decrement-by-one), * so mode 0 is much better when looking at the * individual counts. */ outb(0xb0, 0x43); /* Start at 0xffff */ outb(0xff, 0x42); outb(0xff, 0x42); /* * The PIT starts counting at the next edge, so we * need to delay for a microsecond. The easiest way * to do that is to just read back the 16-bit counter * once from the PIT. */ pit_verify_msb(0); if (pit_expect_msb(0xff, &tsc, &d1)) { for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { if (!pit_expect_msb(0xff-i, &delta, &d2)) break; delta -= tsc; /* * Extrapolate the error and fail fast if the error will * never be below 500 ppm. */ if (i == 1 && d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11) return 0; /* * Iterate until the error is less than 500 ppm */ if (d1+d2 >= delta >> 11) continue; /* * Check the PIT one more time to verify that * all TSC reads were stable wrt the PIT. * * This also guarantees serialization of the * last cycle read ('d2') in pit_expect_msb. */ if (!pit_verify_msb(0xfe - i)) break; goto success; } } pr_info("Fast TSC calibration failed\n"); return 0; success: /* * Ok, if we get here, then we've seen the * MSB of the PIT decrement 'i' times, and the * error has shrunk to less than 500 ppm. * * As a result, we can depend on there not being * any odd delays anywhere, and the TSC reads are * reliable (within the error). * * kHz = ticks / time-in-seconds / 1000; * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000) */ delta *= PIT_TICK_RATE; do_div(delta, i*256*1000); pr_info("Fast TSC calibration using PIT\n"); return delta; } /** * native_calibrate_tsc - determine TSC frequency * Determine TSC frequency via CPUID, else return 0. */ unsigned long native_calibrate_tsc(void) { unsigned int eax_denominator, ebx_numerator, ecx_hz, edx; unsigned int crystal_khz; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; if (boot_cpu_data.cpuid_level < 0x15) return 0; eax_denominator = ebx_numerator = ecx_hz = edx = 0; /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); if (ebx_numerator == 0 || eax_denominator == 0) return 0; crystal_khz = ecx_hz / 1000; /* * Denverton SoCs don't report crystal clock, and also don't support * CPUID.0x16 for the calculation below, so hardcode the 25MHz crystal * clock. */ if (crystal_khz == 0 && boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D) crystal_khz = 25000; /* * TSC frequency reported directly by CPUID is a "hardware reported" * frequency and is the most accurate one so far we have. This * is considered a known frequency. */ if (crystal_khz != 0) setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); /* * Some Intel SoCs like Skylake and Kabylake don't report the crystal * clock, but we can easily calculate it to a high degree of accuracy * by considering the crystal ratio and the CPU speed. */ if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= 0x16) { unsigned int eax_base_mhz, ebx, ecx, edx; cpuid(0x16, &eax_base_mhz, &ebx, &ecx, &edx); crystal_khz = eax_base_mhz * 1000 * eax_denominator / ebx_numerator; } if (crystal_khz == 0) return 0; /* * For Atom SoCs TSC is the only reliable clocksource. * Mark TSC reliable so no watchdog on it. */ if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); #ifdef CONFIG_X86_LOCAL_APIC /* * The local APIC appears to be fed by the core crystal clock * (which sounds entirely sensible). We can set the global * lapic_timer_period here to avoid having to calibrate the APIC * timer later. */ lapic_timer_period = crystal_khz * 1000 / HZ; #endif return crystal_khz * ebx_numerator / eax_denominator; } static unsigned long cpu_khz_from_cpuid(void) { unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; if (boot_cpu_data.cpuid_level < 0x16) return 0; eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0; cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); return eax_base_mhz * 1000; } /* * calibrate cpu using pit, hpet, and ptimer methods. They are available * later in boot after acpi is initialized. */ static unsigned long pit_hpet_ptimer_calibrate_cpu(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; unsigned long flags, latch, ms; int hpet = is_hpet_enabled(), i, loopmin; /* * Run 5 calibration loops to get the lowest frequency value * (the best estimate). We use two different calibration modes * here: * * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and * load a timeout of 50ms. We read the time right after we * started the timer and wait until the PIT count down reaches * zero. In each wait loop iteration we read the TSC and check * the delta to the previous read. We keep track of the min * and max values of that delta. The delta is mostly defined * by the IO time of the PIT access, so we can detect when * any disturbance happened between the two reads. If the * maximum time is significantly larger than the minimum time, * then we discard the result and have another try. * * 2) Reference counter. If available we use the HPET or the * PMTIMER as a reference to check the sanity of that value. * We use separate TSC readouts and check inside of the * reference read for any possible disturbance. We discard * disturbed values here as well. We do that around the PIT * calibration delay loop as we have to wait for a certain * amount of time anyway. */ /* Preset PIT loop values */ latch = CAL_LATCH; ms = CAL_MS; loopmin = CAL_PIT_LOOPS; for (i = 0; i < 3; i++) { unsigned long tsc_pit_khz; /* * Read the start value and the reference count of * hpet/pmtimer when available. Then do the PIT * calibration, which will take at least 50ms, and * read the end value. */ local_irq_save(flags); tsc1 = tsc_read_refs(&ref1, hpet); tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin); tsc2 = tsc_read_refs(&ref2, hpet); local_irq_restore(flags); /* Pick the lowest PIT TSC calibration so far */ tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); /* hpet or pmtimer available ? */ if (ref1 == ref2) continue; /* Check, whether the sampling was disturbed */ if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) continue; tsc2 = (tsc2 - tsc1) * 1000000LL; if (hpet) tsc2 = calc_hpet_ref(tsc2, ref1, ref2); else tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2); tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); /* Check the reference deviation */ delta = ((u64) tsc_pit_min) * 100; do_div(delta, tsc_ref_min); /* * If both calibration results are inside a 10% window * then we can be sure, that the calibration * succeeded. We break out of the loop right away. We * use the reference value, as it is more precise. */ if (delta >= 90 && delta <= 110) { pr_info("PIT calibration matches %s. %d loops\n", hpet ? "HPET" : "PMTIMER", i + 1); return tsc_ref_min; } /* * Check whether PIT failed more than once. This * happens in virtualized environments. We need to * give the virtual PC a slightly longer timeframe for * the HPET/PMTIMER to make the result precise. */ if (i == 1 && tsc_pit_min == ULONG_MAX) { latch = CAL2_LATCH; ms = CAL2_MS; loopmin = CAL2_PIT_LOOPS; } } /* * Now check the results. */ if (tsc_pit_min == ULONG_MAX) { /* PIT gave no useful value */ pr_warn("Unable to calibrate against PIT\n"); /* We don't have an alternative source, disable TSC */ if (!hpet && !ref1 && !ref2) { pr_notice("No reference (HPET/PMTIMER) available\n"); return 0; } /* The alternative source failed as well, disable TSC */ if (tsc_ref_min == ULONG_MAX) { pr_warn("HPET/PMTIMER calibration failed\n"); return 0; } /* Use the alternative source */ pr_info("using %s reference calibration\n", hpet ? "HPET" : "PMTIMER"); return tsc_ref_min; } /* We don't have an alternative source, use the PIT calibration value */ if (!hpet && !ref1 && !ref2) { pr_info("Using PIT calibration value\n"); return tsc_pit_min; } /* The alternative source failed, use the PIT calibration value */ if (tsc_ref_min == ULONG_MAX) { pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n"); return tsc_pit_min; } /* * The calibration values differ too much. In doubt, we use * the PIT value as we know that there are PMTIMERs around * running at double speed. At least we let the user know: */ pr_warn("PIT calibration deviates from %s: %lu %lu\n", hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); pr_info("Using PIT calibration value\n"); return tsc_pit_min; } /** * native_calibrate_cpu_early - can calibrate the cpu early in boot */ unsigned long native_calibrate_cpu_early(void) { unsigned long flags, fast_calibrate = cpu_khz_from_cpuid(); if (!fast_calibrate) fast_calibrate = cpu_khz_from_msr(); if (!fast_calibrate) { local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); } return fast_calibrate; } /** * native_calibrate_cpu - calibrate the cpu */ static unsigned long native_calibrate_cpu(void) { unsigned long tsc_freq = native_calibrate_cpu_early(); if (!tsc_freq) tsc_freq = pit_hpet_ptimer_calibrate_cpu(); return tsc_freq; } void recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP unsigned long cpu_khz_old = cpu_khz; if (!boot_cpu_has(X86_FEATURE_TSC)) return; cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); if (tsc_khz == 0) tsc_khz = cpu_khz; else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, cpu_khz_old, cpu_khz); #endif } EXPORT_SYMBOL_GPL(recalibrate_cpu_khz); static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) { if (!sched_clock_stable()) return; cyc2ns_suspend = sched_clock(); } /* * Even on processors with invariant TSC, TSC gets reset in some the * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to * arbitrary value (still sync'd across cpu's) during resume from such sleep * states. To cope up with this, recompute the cyc2ns_offset for each cpu so * that sched_clock() continues from the point where it was left off during * suspend. */ void tsc_restore_sched_clock_state(void) { unsigned long long offset; unsigned long flags; int cpu; if (!sched_clock_stable()) return; local_irq_save(flags); /* * We're coming out of suspend, there's no concurrency yet; don't * bother being nice about the RCU stuff, just write to both * data fields. */ this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); offset = cyc2ns_suspend - sched_clock(); for_each_possible_cpu(cpu) { per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; } local_irq_restore(flags); } #ifdef CONFIG_CPU_FREQ /* * Frequency scaling support. Adjust the TSC based timer when the CPU frequency * changes. * * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC * as unstable and give up in those cases. * * Should fix up last_tsc too. Currently gettimeofday in the * first tick after the change will be slightly wrong. */ static unsigned int ref_freq; static unsigned long loops_per_jiffy_ref; static unsigned long tsc_khz_ref; static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; if (num_online_cpus() > 1) { mark_tsc_unstable("cpufreq changes on SMP"); return 0; } if (!ref_freq) { ref_freq = freq->old; loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy; tsc_khz_ref = tsc_khz; } if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { boot_cpu_data.loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) mark_tsc_unstable("cpufreq changes"); set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc()); } return 0; } static struct notifier_block time_cpufreq_notifier_block = { .notifier_call = time_cpufreq_notifier }; static int __init cpufreq_register_tsc_scaling(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) return 0; if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; cpufreq_register_notifier(&time_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); return 0; } core_initcall(cpufreq_register_tsc_scaling); #endif /* CONFIG_CPU_FREQ */ #define ART_CPUID_LEAF (0x15) #define ART_MIN_DENOMINATOR (1) /* * If ART is present detect the numerator:denominator to convert to TSC */ static void __init detect_art(void) { unsigned int unused[2]; if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) return; /* * Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required, * and the TSC counter resets must not occur asynchronously. */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || !boot_cpu_has(X86_FEATURE_TSC_ADJUST) || tsc_async_resets) return; cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, &art_to_tsc_numerator, unused, unused+1); if (art_to_tsc_denominator < ART_MIN_DENOMINATOR) return; rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset); /* Make this sticky over multiple CPU init calls */ setup_force_cpu_cap(X86_FEATURE_ART); } /* clocksource code */ static void tsc_resume(struct clocksource *cs) { tsc_verify_tsc_adjust(true); } /* * We used to compare the TSC to the cycle_last value in the clocksource * structure to avoid a nasty time-warp. This can be observed in a * very small window right after one CPU updated cycle_last under * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which * is smaller than the cycle_last reference value due to a TSC which * is slightly behind. This delta is nowhere else observable, but in * that case it results in a forward time jump in the range of hours * due to the unsigned delta calculation of the time keeping core * code, which is necessary to support wrapping clocksources like pm * timer. * * This sanity check is now done in the core timekeeping code. * checking the result of read_tsc() - cycle_last for being negative. * That works because CLOCKSOURCE_MASK(64) does not mask out any bit. */ static u64 read_tsc(struct clocksource *cs) { return (u64)rdtsc_ordered(); } static void tsc_cs_mark_unstable(struct clocksource *cs) { if (tsc_unstable) return; tsc_unstable = 1; if (using_native_sched_clock()) clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to clocksource watchdog\n"); } static void tsc_cs_tick_stable(struct clocksource *cs) { if (tsc_unstable) return; if (using_native_sched_clock()) sched_clock_tick_stable(); } static int tsc_cs_enable(struct clocksource *cs) { vclocks_set_used(VDSO_CLOCKMODE_TSC); return 0; } /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ static struct clocksource clocksource_tsc_early = { .name = "tsc-early", .rating = 299, .uncertainty_margin = 32 * NSEC_PER_MSEC, .read = read_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, .id = CSID_X86_TSC_EARLY, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, .list = LIST_HEAD_INIT(clocksource_tsc_early.list), }; /* * Must mark VALID_FOR_HRES early such that when we unregister tsc_early * this one will immediately take over. We will only register if TSC has * been found good. */ static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_MUST_VERIFY | CLOCK_SOURCE_VERIFY_PERCPU, .id = CSID_X86_TSC, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, .list = LIST_HEAD_INIT(clocksource_tsc.list), }; void mark_tsc_unstable(char *reason) { if (tsc_unstable) return; tsc_unstable = 1; if (using_native_sched_clock()) clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to %s\n", reason); clocksource_mark_unstable(&clocksource_tsc_early); clocksource_mark_unstable(&clocksource_tsc); } EXPORT_SYMBOL_GPL(mark_tsc_unstable); static void __init tsc_disable_clocksource_watchdog(void) { clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; } bool tsc_clocksource_watchdog_disabled(void) { return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) && tsc_as_watchdog && !no_tsc_watchdog; } static void __init check_system_tsc_reliable(void) { #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC) if (is_geode_lx()) { /* RTSC counts during suspend */ #define RTSC_SUSP 0x100 unsigned long res_low, res_high; rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); /* Geode_LX - the OLPC CPU has a very reliable TSC */ if (res_low & RTSC_SUSP) tsc_clocksource_reliable = 1; } #endif if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) tsc_clocksource_reliable = 1; /* * Disable the clocksource watchdog when the system has: * - TSC running at constant frequency * - TSC which does not stop in C-States * - the TSC_ADJUST register which allows to detect even minimal * modifications * - not more than two sockets. As the number of sockets cannot be * evaluated at the early boot stage where this has to be * invoked, check the number of online memory nodes as a * fallback solution which is an reasonable estimate. */ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && boot_cpu_has(X86_FEATURE_TSC_ADJUST) && nr_online_nodes <= 4) tsc_disable_clocksource_watchdog(); } /* * Make an educated guess if the TSC is trustworthy and synchronized * over all CPUs. */ int unsynchronized_tsc(void) { if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable) return 1; #ifdef CONFIG_SMP if (apic_is_clustered_box()) return 1; #endif if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; if (tsc_clocksource_reliable) return 0; /* * Intel systems are normally all synchronized. * Exceptions must mark TSC as unstable: */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { /* assume multi socket systems are not synchronized: */ if (num_possible_cpus() > 1) return 1; } return 0; } /* * Convert ART to TSC given numerator/denominator found in detect_art() */ struct system_counterval_t convert_art_to_tsc(u64 art) { u64 tmp, res, rem; rem = do_div(art, art_to_tsc_denominator); res = art * art_to_tsc_numerator; tmp = rem * art_to_tsc_numerator; do_div(tmp, art_to_tsc_denominator); res += tmp + art_to_tsc_offset; return (struct system_counterval_t) { .cs_id = have_art ? CSID_X86_TSC : CSID_GENERIC, .cycles = res, }; } EXPORT_SYMBOL(convert_art_to_tsc); /** * convert_art_ns_to_tsc() - Convert ART in nanoseconds to TSC. * @art_ns: ART (Always Running Timer) in unit of nanoseconds * * PTM requires all timestamps to be in units of nanoseconds. When user * software requests a cross-timestamp, this function converts system timestamp * to TSC. * * This is valid when CPU feature flag X86_FEATURE_TSC_KNOWN_FREQ is set * indicating the tsc_khz is derived from CPUID[15H]. Drivers should check * that this flag is set before conversion to TSC is attempted. * * Return: * struct system_counterval_t - system counter value with the ID of the * corresponding clocksource: * cycles: System counter value * cs_id: The clocksource ID for validating comparability */ struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns) { u64 tmp, res, rem; rem = do_div(art_ns, USEC_PER_SEC); res = art_ns * tsc_khz; tmp = rem * tsc_khz; do_div(tmp, USEC_PER_SEC); res += tmp; return (struct system_counterval_t) { .cs_id = have_art ? CSID_X86_TSC : CSID_GENERIC, .cycles = res, }; } EXPORT_SYMBOL(convert_art_ns_to_tsc); static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); /** * tsc_refine_calibration_work - Further refine tsc freq calibration * @work: ignored. * * This functions uses delayed work over a period of a * second to further refine the TSC freq value. Since this is * timer based, instead of loop based, we don't block the boot * process while this longer calibration is done. * * If there are any calibration anomalies (too many SMIs, etc), * or the refined calibration is off by 1% of the fast early * calibration, we throw out the new calibration and use the * early calibration. */ static void tsc_refine_calibration_work(struct work_struct *work) { static u64 tsc_start = ULLONG_MAX, ref_start; static int hpet; u64 tsc_stop, ref_stop, delta; unsigned long freq; int cpu; /* Don't bother refining TSC on unstable systems */ if (tsc_unstable) goto unreg; /* * Since the work is started early in boot, we may be * delayed the first time we expire. So set the workqueue * again once we know timers are working. */ if (tsc_start == ULLONG_MAX) { restart: /* * Only set hpet once, to avoid mixing hardware * if the hpet becomes enabled later. */ hpet = is_hpet_enabled(); tsc_start = tsc_read_refs(&ref_start, hpet); schedule_delayed_work(&tsc_irqwork, HZ); return; } tsc_stop = tsc_read_refs(&ref_stop, hpet); /* hpet or pmtimer available ? */ if (ref_start == ref_stop) goto out; /* Check, whether the sampling was disturbed */ if (tsc_stop == ULLONG_MAX) goto restart; delta = tsc_stop - tsc_start; delta *= 1000000LL; if (hpet) freq = calc_hpet_ref(delta, ref_start, ref_stop); else freq = calc_pmtimer_ref(delta, ref_start, ref_stop); /* Will hit this only if tsc_force_recalibrate has been set */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { /* Warn if the deviation exceeds 500 ppm */ if (abs(tsc_khz - freq) > (tsc_khz >> 11)) { pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n"); pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n", (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); } pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n", hpet ? "HPET" : "PM_TIMER", (unsigned long)freq / 1000, (unsigned long)freq % 1000); return; } /* Make sure we're within 1% */ if (abs(tsc_khz - freq) > tsc_khz/100) goto out; tsc_khz = freq; pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n", (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); /* Inform the TSC deadline clockevent devices about the recalibration */ lapic_update_tsc_freq(); /* Update the sched_clock() rate to match the clocksource one */ for_each_possible_cpu(cpu) set_cyc2ns_scale(tsc_khz, cpu, tsc_stop); out: if (tsc_unstable) goto unreg; if (boot_cpu_has(X86_FEATURE_ART)) have_art = true; clocksource_register_khz(&clocksource_tsc, tsc_khz); unreg: clocksource_unregister(&clocksource_tsc_early); } static int __init init_tsc_clocksource(void) { if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz) return 0; if (tsc_unstable) { clocksource_unregister(&clocksource_tsc_early); return 0; } if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; /* * When TSC frequency is known (retrieved via MSR or CPUID), we skip * the refined calibration and directly register it as a clocksource. */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { if (boot_cpu_has(X86_FEATURE_ART)) have_art = true; clocksource_register_khz(&clocksource_tsc, tsc_khz); clocksource_unregister(&clocksource_tsc_early); if (!tsc_force_recalibrate) return 0; } schedule_delayed_work(&tsc_irqwork, 0); return 0; } /* * We use device_initcall here, to ensure we run after the hpet * is fully initialized, which may occur at fs_initcall time. */ device_initcall(init_tsc_clocksource); static bool __init determine_cpu_tsc_frequencies(bool early) { /* Make sure that cpu and tsc are not already calibrated */ WARN_ON(cpu_khz || tsc_khz); if (early) { cpu_khz = x86_platform.calibrate_cpu(); if (tsc_early_khz) tsc_khz = tsc_early_khz; else tsc_khz = x86_platform.calibrate_tsc(); } else { /* We should not be here with non-native cpu calibration */ WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); cpu_khz = pit_hpet_ptimer_calibrate_cpu(); } /* * Trust non-zero tsc_khz as authoritative, * and use it to sanity check cpu_khz, * which will be off if system timer is off. */ if (tsc_khz == 0) tsc_khz = cpu_khz; else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; if (tsc_khz == 0) return false; pr_info("Detected %lu.%03lu MHz processor\n", (unsigned long)cpu_khz / KHZ, (unsigned long)cpu_khz % KHZ); if (cpu_khz != tsc_khz) { pr_info("Detected %lu.%03lu MHz TSC", (unsigned long)tsc_khz / KHZ, (unsigned long)tsc_khz % KHZ); } return true; } static unsigned long __init get_loops_per_jiffy(void) { u64 lpj = (u64)tsc_khz * KHZ; do_div(lpj, HZ); return lpj; } static void __init tsc_enable_sched_clock(void) { loops_per_jiffy = get_loops_per_jiffy(); use_tsc_delay(); /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); cyc2ns_init_boot_cpu(); static_branch_enable(&__use_tsc); } void __init tsc_early_init(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) return; /* Don't change UV TSC multi-chassis synchronization */ if (is_early_uv_system()) return; if (!determine_cpu_tsc_frequencies(true)) return; tsc_enable_sched_clock(); } void __init tsc_init(void) { if (!cpu_feature_enabled(X86_FEATURE_TSC)) { setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } /* * native_calibrate_cpu_early can only calibrate using methods that are * available early in boot. */ if (x86_platform.calibrate_cpu == native_calibrate_cpu_early) x86_platform.calibrate_cpu = native_calibrate_cpu; if (!tsc_khz) { /* We failed to determine frequencies earlier, try again */ if (!determine_cpu_tsc_frequencies(false)) { mark_tsc_unstable("could not calculate TSC khz"); setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } tsc_enable_sched_clock(); } cyc2ns_init_secondary_cpus(); if (!no_sched_irq_time) enable_sched_clock_irqtime(); lpj_fine = get_loops_per_jiffy(); check_system_tsc_reliable(); if (unsynchronized_tsc()) { mark_tsc_unstable("TSCs unsynchronized"); return; } if (tsc_clocksource_reliable || no_tsc_watchdog) tsc_disable_clocksource_watchdog(); clocksource_register_khz(&clocksource_tsc_early, tsc_khz); detect_art(); } #ifdef CONFIG_SMP /* * Check whether existing calibration data can be reused. */ unsigned long calibrate_delay_is_known(void) { int sibling, cpu = smp_processor_id(); int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); const struct cpumask *mask = topology_core_cpumask(cpu); /* * If TSC has constant frequency and TSC is synchronized across * sockets then reuse CPU0 calibration. */ if (constant_tsc && !tsc_unstable) return cpu_data(0).loops_per_jiffy; /* * If TSC has constant frequency and TSC is not synchronized across * sockets and this is not the first CPU in the socket, then reuse * the calibration value of an already online CPU on that socket. * * This assumes that CONSTANT_TSC is consistent for all CPUs in a * socket. */ if (!constant_tsc || !mask) return 0; sibling = cpumask_any_but(mask, cpu); if (sibling < nr_cpu_ids) return cpu_data(sibling).loops_per_jiffy; return 0; } #endif |
6 2 5 4 3 1 6 2 1 1 1 1 1 4 3 1 1 1 2 13 1 7 4 1 170 171 171 1 22 2 1 19 1 19 25 1 22 20 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1994 Linus Torvalds * * Cyrix stuff, June 1998 by: * - Rafael R. Reilova (moved everything from head.S), * <rreilova@ececs.uc.edu> * - Channing Corn (tests & fixes), * - Andrew D. Balsa (code cleanup). */ #include <linux/init.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/nospec.h> #include <linux/prctl.h> #include <linux/sched/smt.h> #include <linux/pgtable.h> #include <linux/bpf.h> #include <asm/spec-ctrl.h> #include <asm/cmdline.h> #include <asm/bugs.h> #include <asm/processor.h> #include <asm/processor-flags.h> #include <asm/fpu/api.h> #include <asm/msr.h> #include <asm/vmx.h> #include <asm/paravirt.h> #include <asm/cpu_device_id.h> #include <asm/e820/api.h> #include <asm/hypervisor.h> #include <asm/tlbflush.h> #include <asm/cpu.h> #include "cpu.h" static void __init spectre_v1_select_mitigation(void); static void __init spectre_v2_select_mitigation(void); static void __init retbleed_select_mitigation(void); static void __init spectre_v2_user_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); static void __init md_clear_update_mitigation(void); static void __init md_clear_select_mitigation(void); static void __init taa_select_mitigation(void); static void __init mmio_select_mitigation(void); static void __init srbds_select_mitigation(void); static void __init l1d_flush_select_mitigation(void); static void __init srso_select_mitigation(void); static void __init gds_select_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); /* The current value of the SPEC_CTRL MSR with task-specific bits set */ DEFINE_PER_CPU(u64, x86_spec_ctrl_current); EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; EXPORT_SYMBOL_GPL(x86_pred_cmd); static u64 __ro_after_init x86_arch_cap_msr; static DEFINE_MUTEX(spec_ctrl_mutex); void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; /* Update SPEC_CTRL MSR and its cached copy unconditionally */ static void update_spec_ctrl(u64 val) { this_cpu_write(x86_spec_ctrl_current, val); wrmsrl(MSR_IA32_SPEC_CTRL, val); } /* * Keep track of the SPEC_CTRL MSR value for the current task, which may differ * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update(). */ void update_spec_ctrl_cond(u64 val) { if (this_cpu_read(x86_spec_ctrl_current) == val) return; this_cpu_write(x86_spec_ctrl_current, val); /* * When KERNEL_IBRS this MSR is written on return-to-user, unless * forced the update can be delayed until that time. */ if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) wrmsrl(MSR_IA32_SPEC_CTRL, val); } noinstr u64 spec_ctrl_current(void) { return this_cpu_read(x86_spec_ctrl_current); } EXPORT_SYMBOL_GPL(spec_ctrl_current); /* * AMD specific MSR info for Speculative Store Bypass control. * x86_amd_ls_cfg_ssbd_mask is initialized in identify_boot_cpu(). */ u64 __ro_after_init x86_amd_ls_cfg_base; u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask; /* Control conditional STIBP in switch_to() */ DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp); /* Control conditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); /* Control unconditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); /* Control MDS CPU buffer clear before idling (halt, mwait) */ DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); /* * Controls whether l1d flush based mitigations are enabled, * based on hw features and admin setting via boot parameter * defaults to false */ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); /* Controls CPU Fill buffer clear before KVM guest MMIO accesses */ DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); EXPORT_SYMBOL_GPL(mmio_stale_data_clear); void __init cpu_select_mitigations(void) { /* * Read the SPEC_CTRL MSR to account for reserved bits which may * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD * init code as it is not enumerated and depends on the family. */ if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); /* * Previously running kernel (kexec), may have some controls * turned ON. Clear them and let the mitigations setup below * rediscover them based on configuration. */ x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; } x86_arch_cap_msr = x86_read_arch_cap_msr(); /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); /* * retbleed_select_mitigation() relies on the state set by * spectre_v2_select_mitigation(); specifically it wants to know about * spectre_v2=ibrs. */ retbleed_select_mitigation(); /* * spectre_v2_user_select_mitigation() relies on the state set by * retbleed_select_mitigation(); specifically the STIBP selection is * forced for UNRET or IBPB. */ spectre_v2_user_select_mitigation(); ssb_select_mitigation(); l1tf_select_mitigation(); md_clear_select_mitigation(); srbds_select_mitigation(); l1d_flush_select_mitigation(); /* * srso_select_mitigation() depends and must run after * retbleed_select_mitigation(). */ srso_select_mitigation(); gds_select_mitigation(); } /* * NOTE: This function is *only* called for SVM, since Intel uses * MSR_IA32_SPEC_CTRL for SSBD. */ void x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest) { u64 guestval, hostval; struct thread_info *ti = current_thread_info(); /* * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported. */ if (!static_cpu_has(X86_FEATURE_LS_CFG_SSBD) && !static_cpu_has(X86_FEATURE_VIRT_SSBD)) return; /* * If the host has SSBD mitigation enabled, force it in the host's * virtual MSR value. If its not permanently enabled, evaluate * current's TIF_SSBD thread flag. */ if (static_cpu_has(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE)) hostval = SPEC_CTRL_SSBD; else hostval = ssbd_tif_to_spec_ctrl(ti->flags); /* Sanitize the guest value */ guestval = guest_virt_spec_ctrl & SPEC_CTRL_SSBD; if (hostval != guestval) { unsigned long tif; tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) : ssbd_spec_ctrl_to_tif(hostval); speculation_ctrl_update(tif); } } EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl); static void x86_amd_ssb_disable(void) { u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask; if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) wrmsrl(MSR_AMD64_LS_CFG, msrval); } #undef pr_fmt #define pr_fmt(fmt) "MDS: " fmt /* Default mitigation for MDS-affected CPUs */ static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL; static bool mds_nosmt __ro_after_init = false; static const char * const mds_strings[] = { [MDS_MITIGATION_OFF] = "Vulnerable", [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers", [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode", }; static void __init mds_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { mds_mitigation = MDS_MITIGATION_OFF; return; } if (mds_mitigation == MDS_MITIGATION_FULL) { if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && (mds_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); } } static int __init mds_cmdline(char *str) { if (!boot_cpu_has_bug(X86_BUG_MDS)) return 0; if (!str) return -EINVAL; if (!strcmp(str, "off")) mds_mitigation = MDS_MITIGATION_OFF; else if (!strcmp(str, "full")) mds_mitigation = MDS_MITIGATION_FULL; else if (!strcmp(str, "full,nosmt")) { mds_mitigation = MDS_MITIGATION_FULL; mds_nosmt = true; } return 0; } early_param("mds", mds_cmdline); #undef pr_fmt #define pr_fmt(fmt) "TAA: " fmt enum taa_mitigations { TAA_MITIGATION_OFF, TAA_MITIGATION_UCODE_NEEDED, TAA_MITIGATION_VERW, TAA_MITIGATION_TSX_DISABLED, }; /* Default mitigation for TAA-affected CPUs */ static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; static bool taa_nosmt __ro_after_init; static const char * const taa_strings[] = { [TAA_MITIGATION_OFF] = "Vulnerable", [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", }; static void __init taa_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_TAA)) { taa_mitigation = TAA_MITIGATION_OFF; return; } /* TSX previously disabled by tsx=off */ if (!boot_cpu_has(X86_FEATURE_RTM)) { taa_mitigation = TAA_MITIGATION_TSX_DISABLED; return; } if (cpu_mitigations_off()) { taa_mitigation = TAA_MITIGATION_OFF; return; } /* * TAA mitigation via VERW is turned off if both * tsx_async_abort=off and mds=off are specified. */ if (taa_mitigation == TAA_MITIGATION_OFF && mds_mitigation == MDS_MITIGATION_OFF) return; if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) taa_mitigation = TAA_MITIGATION_VERW; else taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; /* * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. * A microcode update fixes this behavior to clear CPU buffers. It also * adds support for MSR_IA32_TSX_CTRL which is enumerated by the * ARCH_CAP_TSX_CTRL_MSR bit. * * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode * update is required. */ if ( (x86_arch_cap_msr & ARCH_CAP_MDS_NO) && !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)) taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; /* * TSX is enabled, select alternate mitigation for TAA which is * the same as MDS. Enable MDS static branch to clear CPU buffers. * * For guests that can't determine whether the correct microcode is * present on host, enable the mitigation for UCODE_NEEDED as well. */ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (taa_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); } static int __init tsx_async_abort_parse_cmdline(char *str) { if (!boot_cpu_has_bug(X86_BUG_TAA)) return 0; if (!str) return -EINVAL; if (!strcmp(str, "off")) { taa_mitigation = TAA_MITIGATION_OFF; } else if (!strcmp(str, "full")) { taa_mitigation = TAA_MITIGATION_VERW; } else if (!strcmp(str, "full,nosmt")) { taa_mitigation = TAA_MITIGATION_VERW; taa_nosmt = true; } return 0; } early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "MMIO Stale Data: " fmt enum mmio_mitigations { MMIO_MITIGATION_OFF, MMIO_MITIGATION_UCODE_NEEDED, MMIO_MITIGATION_VERW, }; /* Default mitigation for Processor MMIO Stale Data vulnerabilities */ static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW; static bool mmio_nosmt __ro_after_init = false; static const char * const mmio_strings[] = { [MMIO_MITIGATION_OFF] = "Vulnerable", [MMIO_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", [MMIO_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", }; static void __init mmio_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || cpu_mitigations_off()) { mmio_mitigation = MMIO_MITIGATION_OFF; return; } if (mmio_mitigation == MMIO_MITIGATION_OFF) return; /* * Enable CPU buffer clear mitigation for host and VMM, if also affected * by MDS or TAA. Otherwise, enable mitigation for VMM only. */ if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM))) setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); /* * X86_FEATURE_CLEAR_CPU_BUF could be enabled by other VERW based * mitigations, disable KVM-only mitigation in that case. */ if (boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) static_branch_disable(&mmio_stale_data_clear); else static_branch_enable(&mmio_stale_data_clear); /* * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can * be propagated to uncore buffers, clearing the Fill buffers on idle * is required irrespective of SMT state. */ if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) static_branch_enable(&mds_idle_clear); /* * Check if the system has the right microcode. * * CPU Fill buffer clear mitigation is enumerated by either an explicit * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS * affected systems. */ if ((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) || (boot_cpu_has(X86_FEATURE_MD_CLEAR) && boot_cpu_has(X86_FEATURE_FLUSH_L1D) && !(x86_arch_cap_msr & ARCH_CAP_MDS_NO))) mmio_mitigation = MMIO_MITIGATION_VERW; else mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; if (mmio_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); } static int __init mmio_stale_data_parse_cmdline(char *str) { if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) return 0; if (!str) return -EINVAL; if (!strcmp(str, "off")) { mmio_mitigation = MMIO_MITIGATION_OFF; } else if (!strcmp(str, "full")) { mmio_mitigation = MMIO_MITIGATION_VERW; } else if (!strcmp(str, "full,nosmt")) { mmio_mitigation = MMIO_MITIGATION_VERW; mmio_nosmt = true; } return 0; } early_param("mmio_stale_data", mmio_stale_data_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "Register File Data Sampling: " fmt enum rfds_mitigations { RFDS_MITIGATION_OFF, RFDS_MITIGATION_VERW, RFDS_MITIGATION_UCODE_NEEDED, }; /* Default mitigation for Register File Data Sampling */ static enum rfds_mitigations rfds_mitigation __ro_after_init = IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_VERW : RFDS_MITIGATION_OFF; static const char * const rfds_strings[] = { [RFDS_MITIGATION_OFF] = "Vulnerable", [RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File", [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", }; static void __init rfds_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) { rfds_mitigation = RFDS_MITIGATION_OFF; return; } if (rfds_mitigation == RFDS_MITIGATION_OFF) return; if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); else rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; } static __init int rfds_parse_cmdline(char *str) { if (!str) return -EINVAL; if (!boot_cpu_has_bug(X86_BUG_RFDS)) return 0; if (!strcmp(str, "off")) rfds_mitigation = RFDS_MITIGATION_OFF; else if (!strcmp(str, "on")) rfds_mitigation = RFDS_MITIGATION_VERW; return 0; } early_param("reg_file_data_sampling", rfds_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "" fmt static void __init md_clear_update_mitigation(void) { if (cpu_mitigations_off()) return; if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) goto out; /* * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO * Stale Data mitigation, if necessary. */ if (mds_mitigation == MDS_MITIGATION_OFF && boot_cpu_has_bug(X86_BUG_MDS)) { mds_mitigation = MDS_MITIGATION_FULL; mds_select_mitigation(); } if (taa_mitigation == TAA_MITIGATION_OFF && boot_cpu_has_bug(X86_BUG_TAA)) { taa_mitigation = TAA_MITIGATION_VERW; taa_select_mitigation(); } /* * MMIO_MITIGATION_OFF is not checked here so that mmio_stale_data_clear * gets updated correctly as per X86_FEATURE_CLEAR_CPU_BUF state. */ if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { mmio_mitigation = MMIO_MITIGATION_VERW; mmio_select_mitigation(); } if (rfds_mitigation == RFDS_MITIGATION_OFF && boot_cpu_has_bug(X86_BUG_RFDS)) { rfds_mitigation = RFDS_MITIGATION_VERW; rfds_select_mitigation(); } out: if (boot_cpu_has_bug(X86_BUG_MDS)) pr_info("MDS: %s\n", mds_strings[mds_mitigation]); if (boot_cpu_has_bug(X86_BUG_TAA)) pr_info("TAA: %s\n", taa_strings[taa_mitigation]); if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) pr_info("MMIO Stale Data: Unknown: No mitigations\n"); if (boot_cpu_has_bug(X86_BUG_RFDS)) pr_info("Register File Data Sampling: %s\n", rfds_strings[rfds_mitigation]); } static void __init md_clear_select_mitigation(void) { mds_select_mitigation(); taa_select_mitigation(); mmio_select_mitigation(); rfds_select_mitigation(); /* * As these mitigations are inter-related and rely on VERW instruction * to clear the microarchitural buffers, update and print their status * after mitigation selection is done for each of these vulnerabilities. */ md_clear_update_mitigation(); } #undef pr_fmt #define pr_fmt(fmt) "SRBDS: " fmt enum srbds_mitigations { SRBDS_MITIGATION_OFF, SRBDS_MITIGATION_UCODE_NEEDED, SRBDS_MITIGATION_FULL, SRBDS_MITIGATION_TSX_OFF, SRBDS_MITIGATION_HYPERVISOR, }; static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL; static const char * const srbds_strings[] = { [SRBDS_MITIGATION_OFF] = "Vulnerable", [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode", [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled", [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", }; static bool srbds_off; void update_srbds_msr(void) { u64 mcu_ctrl; if (!boot_cpu_has_bug(X86_BUG_SRBDS)) return; if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return; if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED) return; /* * A MDS_NO CPU for which SRBDS mitigation is not needed due to TSX * being disabled and it hasn't received the SRBDS MSR microcode. */ if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) return; rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); switch (srbds_mitigation) { case SRBDS_MITIGATION_OFF: case SRBDS_MITIGATION_TSX_OFF: mcu_ctrl |= RNGDS_MITG_DIS; break; case SRBDS_MITIGATION_FULL: mcu_ctrl &= ~RNGDS_MITG_DIS; break; default: break; } wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); } static void __init srbds_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_SRBDS)) return; /* * Check to see if this is one of the MDS_NO systems supporting TSX that * are only exposed to SRBDS when TSX is enabled or when CPU is affected * by Processor MMIO Stale Data vulnerability. */ if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; else if (cpu_mitigations_off() || srbds_off) srbds_mitigation = SRBDS_MITIGATION_OFF; update_srbds_msr(); pr_info("%s\n", srbds_strings[srbds_mitigation]); } static int __init srbds_parse_cmdline(char *str) { if (!str) return -EINVAL; if (!boot_cpu_has_bug(X86_BUG_SRBDS)) return 0; srbds_off = !strcmp(str, "off"); return 0; } early_param("srbds", srbds_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "L1D Flush : " fmt enum l1d_flush_mitigations { L1D_FLUSH_OFF = 0, L1D_FLUSH_ON, }; static enum l1d_flush_mitigations l1d_flush_mitigation __initdata = L1D_FLUSH_OFF; static void __init l1d_flush_select_mitigation(void) { if (!l1d_flush_mitigation || !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) return; static_branch_enable(&switch_mm_cond_l1d_flush); pr_info("Conditional flush on switch_mm() enabled\n"); } static int __init l1d_flush_parse_cmdline(char *str) { if (!strcmp(str, "on")) l1d_flush_mitigation = L1D_FLUSH_ON; return 0; } early_param("l1d_flush", l1d_flush_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "GDS: " fmt enum gds_mitigations { GDS_MITIGATION_OFF, GDS_MITIGATION_UCODE_NEEDED, GDS_MITIGATION_FORCE, GDS_MITIGATION_FULL, GDS_MITIGATION_FULL_LOCKED, GDS_MITIGATION_HYPERVISOR, }; #if IS_ENABLED(CONFIG_MITIGATION_GDS_FORCE) static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE; #else static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL; #endif static const char * const gds_strings[] = { [GDS_MITIGATION_OFF] = "Vulnerable", [GDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", [GDS_MITIGATION_FORCE] = "Mitigation: AVX disabled, no microcode", [GDS_MITIGATION_FULL] = "Mitigation: Microcode", [GDS_MITIGATION_FULL_LOCKED] = "Mitigation: Microcode (locked)", [GDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", }; bool gds_ucode_mitigated(void) { return (gds_mitigation == GDS_MITIGATION_FULL || gds_mitigation == GDS_MITIGATION_FULL_LOCKED); } EXPORT_SYMBOL_GPL(gds_ucode_mitigated); void update_gds_msr(void) { u64 mcu_ctrl_after; u64 mcu_ctrl; switch (gds_mitigation) { case GDS_MITIGATION_OFF: rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); mcu_ctrl |= GDS_MITG_DIS; break; case GDS_MITIGATION_FULL_LOCKED: /* * The LOCKED state comes from the boot CPU. APs might not have * the same state. Make sure the mitigation is enabled on all * CPUs. */ case GDS_MITIGATION_FULL: rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); mcu_ctrl &= ~GDS_MITG_DIS; break; case GDS_MITIGATION_FORCE: case GDS_MITIGATION_UCODE_NEEDED: case GDS_MITIGATION_HYPERVISOR: return; } wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); /* * Check to make sure that the WRMSR value was not ignored. Writes to * GDS_MITG_DIS will be ignored if this processor is locked but the boot * processor was not. */ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after); } static void __init gds_select_mitigation(void) { u64 mcu_ctrl; if (!boot_cpu_has_bug(X86_BUG_GDS)) return; if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { gds_mitigation = GDS_MITIGATION_HYPERVISOR; goto out; } if (cpu_mitigations_off()) gds_mitigation = GDS_MITIGATION_OFF; /* Will verify below that mitigation _can_ be disabled */ /* No microcode */ if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) { if (gds_mitigation == GDS_MITIGATION_FORCE) { /* * This only needs to be done on the boot CPU so do it * here rather than in update_gds_msr() */ setup_clear_cpu_cap(X86_FEATURE_AVX); pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); } else { gds_mitigation = GDS_MITIGATION_UCODE_NEEDED; } goto out; } /* Microcode has mitigation, use it */ if (gds_mitigation == GDS_MITIGATION_FORCE) gds_mitigation = GDS_MITIGATION_FULL; rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); if (mcu_ctrl & GDS_MITG_LOCKED) { if (gds_mitigation == GDS_MITIGATION_OFF) pr_warn("Mitigation locked. Disable failed.\n"); /* * The mitigation is selected from the boot CPU. All other CPUs * _should_ have the same state. If the boot CPU isn't locked * but others are then update_gds_msr() will WARN() of the state * mismatch. If the boot CPU is locked update_gds_msr() will * ensure the other CPUs have the mitigation enabled. */ gds_mitigation = GDS_MITIGATION_FULL_LOCKED; } update_gds_msr(); out: pr_info("%s\n", gds_strings[gds_mitigation]); } static int __init gds_parse_cmdline(char *str) { if (!str) return -EINVAL; if (!boot_cpu_has_bug(X86_BUG_GDS)) return 0; if (!strcmp(str, "off")) gds_mitigation = GDS_MITIGATION_OFF; else if (!strcmp(str, "force")) gds_mitigation = GDS_MITIGATION_FORCE; return 0; } early_param("gather_data_sampling", gds_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt enum spectre_v1_mitigation { SPECTRE_V1_MITIGATION_NONE, SPECTRE_V1_MITIGATION_AUTO, }; static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init = SPECTRE_V1_MITIGATION_AUTO; static const char * const spectre_v1_strings[] = { [SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers", [SPECTRE_V1_MITIGATION_AUTO] = "Mitigation: usercopy/swapgs barriers and __user pointer sanitization", }; /* * Does SMAP provide full mitigation against speculative kernel access to * userspace? */ static bool smap_works_speculatively(void) { if (!boot_cpu_has(X86_FEATURE_SMAP)) return false; /* * On CPUs which are vulnerable to Meltdown, SMAP does not * prevent speculative access to user data in the L1 cache. * Consider SMAP to be non-functional as a mitigation on these * CPUs. */ if (boot_cpu_has(X86_BUG_CPU_MELTDOWN)) return false; return true; } static void __init spectre_v1_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) { spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; return; } if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) { /* * With Spectre v1, a user can speculatively control either * path of a conditional swapgs with a user-controlled GS * value. The mitigation is to add lfences to both code paths. * * If FSGSBASE is enabled, the user can put a kernel address in * GS, in which case SMAP provides no protection. * * If FSGSBASE is disabled, the user can only put a user space * address in GS. That makes an attack harder, but still * possible if there's no SMAP protection. */ if (boot_cpu_has(X86_FEATURE_FSGSBASE) || !smap_works_speculatively()) { /* * Mitigation can be provided from SWAPGS itself or * PTI as the CR3 write in the Meltdown mitigation * is serializing. * * If neither is there, mitigate with an LFENCE to * stop speculation through swapgs. */ if (boot_cpu_has_bug(X86_BUG_SWAPGS) && !boot_cpu_has(X86_FEATURE_PTI)) setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_USER); /* * Enable lfences in the kernel entry (non-swapgs) * paths, to prevent user entry from speculatively * skipping swapgs. */ setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_KERNEL); } } pr_info("%s\n", spectre_v1_strings[spectre_v1_mitigation]); } static int __init nospectre_v1_cmdline(char *str) { spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; return 0; } early_param("nospectre_v1", nospectre_v1_cmdline); enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; #undef pr_fmt #define pr_fmt(fmt) "RETBleed: " fmt enum retbleed_mitigation { RETBLEED_MITIGATION_NONE, RETBLEED_MITIGATION_UNRET, RETBLEED_MITIGATION_IBPB, RETBLEED_MITIGATION_IBRS, RETBLEED_MITIGATION_EIBRS, RETBLEED_MITIGATION_STUFF, }; enum retbleed_mitigation_cmd { RETBLEED_CMD_OFF, RETBLEED_CMD_AUTO, RETBLEED_CMD_UNRET, RETBLEED_CMD_IBPB, RETBLEED_CMD_STUFF, }; static const char * const retbleed_strings[] = { [RETBLEED_MITIGATION_NONE] = "Vulnerable", [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk", [RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB", [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS", [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS", [RETBLEED_MITIGATION_STUFF] = "Mitigation: Stuffing", }; static enum retbleed_mitigation retbleed_mitigation __ro_after_init = RETBLEED_MITIGATION_NONE; static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = RETBLEED_CMD_AUTO; static int __ro_after_init retbleed_nosmt = false; static int __init retbleed_parse_cmdline(char *str) { if (!str) return -EINVAL; while (str) { char *next = strchr(str, ','); if (next) { *next = 0; next++; } if (!strcmp(str, "off")) { retbleed_cmd = RETBLEED_CMD_OFF; } else if (!strcmp(str, "auto")) { retbleed_cmd = RETBLEED_CMD_AUTO; } else if (!strcmp(str, "unret")) { retbleed_cmd = RETBLEED_CMD_UNRET; } else if (!strcmp(str, "ibpb")) { retbleed_cmd = RETBLEED_CMD_IBPB; } else if (!strcmp(str, "stuff")) { retbleed_cmd = RETBLEED_CMD_STUFF; } else if (!strcmp(str, "nosmt")) { retbleed_nosmt = true; } else if (!strcmp(str, "force")) { setup_force_cpu_bug(X86_BUG_RETBLEED); } else { pr_err("Ignoring unknown retbleed option (%s).", str); } str = next; } return 0; } early_param("retbleed", retbleed_parse_cmdline); #define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n" #define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n" static void __init retbleed_select_mitigation(void) { bool mitigate_smt = false; if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) return; switch (retbleed_cmd) { case RETBLEED_CMD_OFF: return; case RETBLEED_CMD_UNRET: if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) { retbleed_mitigation = RETBLEED_MITIGATION_UNRET; } else { pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n"); goto do_cmd_auto; } break; case RETBLEED_CMD_IBPB: if (!boot_cpu_has(X86_FEATURE_IBPB)) { pr_err("WARNING: CPU does not support IBPB.\n"); goto do_cmd_auto; } else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } else { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); goto do_cmd_auto; } break; case RETBLEED_CMD_STUFF: if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && spectre_v2_enabled == SPECTRE_V2_RETPOLINE) { retbleed_mitigation = RETBLEED_MITIGATION_STUFF; } else { if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); else pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n"); goto do_cmd_auto; } break; do_cmd_auto: case RETBLEED_CMD_AUTO: if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) retbleed_mitigation = RETBLEED_MITIGATION_UNRET; else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && boot_cpu_has(X86_FEATURE_IBPB)) retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } /* * The Intel mitigation (IBRS or eIBRS) was already selected in * spectre_v2_select_mitigation(). 'retbleed_mitigation' will * be set accordingly below. */ break; } switch (retbleed_mitigation) { case RETBLEED_MITIGATION_UNRET: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); x86_return_thunk = retbleed_return_thunk; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) pr_err(RETBLEED_UNTRAIN_MSG); mitigate_smt = true; break; case RETBLEED_MITIGATION_IBPB: setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); mitigate_smt = true; break; case RETBLEED_MITIGATION_STUFF: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); x86_return_thunk = call_depth_return_thunk; break; default: break; } if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) && (retbleed_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); /* * Let IBRS trump all on Intel without affecting the effects of the * retbleed= cmdline option except for call depth based stuffing */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { switch (spectre_v2_enabled) { case SPECTRE_V2_IBRS: retbleed_mitigation = RETBLEED_MITIGATION_IBRS; break; case SPECTRE_V2_EIBRS: case SPECTRE_V2_EIBRS_RETPOLINE: case SPECTRE_V2_EIBRS_LFENCE: retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; break; default: if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) pr_err(RETBLEED_INTEL_MSG); } } pr_info("%s\n", retbleed_strings[retbleed_mitigation]); } #undef pr_fmt #define pr_fmt(fmt) "Spectre V2 : " fmt static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = SPECTRE_V2_USER_NONE; static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = SPECTRE_V2_USER_NONE; #ifdef CONFIG_MITIGATION_RETPOLINE static bool spectre_v2_bad_module; bool retpoline_module_ok(bool has_retpoline) { if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) return true; pr_err("System may be vulnerable to spectre v2\n"); spectre_v2_bad_module = true; return false; } static inline const char *spectre_v2_module_string(void) { return spectre_v2_bad_module ? " - vulnerable module loaded" : ""; } #else static inline const char *spectre_v2_module_string(void) { return ""; } #endif #define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" #define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" #define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n" #define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n" #ifdef CONFIG_BPF_SYSCALL void unpriv_ebpf_notify(int new_state) { if (new_state) return; /* Unprivileged eBPF is enabled */ switch (spectre_v2_enabled) { case SPECTRE_V2_EIBRS: pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); break; case SPECTRE_V2_EIBRS_LFENCE: if (sched_smt_active()) pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); break; default: break; } } #endif static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt); return len == arglen && !strncmp(arg, opt, len); } /* The kernel command line selection for spectre v2 */ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_NONE, SPECTRE_V2_CMD_AUTO, SPECTRE_V2_CMD_FORCE, SPECTRE_V2_CMD_RETPOLINE, SPECTRE_V2_CMD_RETPOLINE_GENERIC, SPECTRE_V2_CMD_RETPOLINE_LFENCE, SPECTRE_V2_CMD_EIBRS, SPECTRE_V2_CMD_EIBRS_RETPOLINE, SPECTRE_V2_CMD_EIBRS_LFENCE, SPECTRE_V2_CMD_IBRS, }; enum spectre_v2_user_cmd { SPECTRE_V2_USER_CMD_NONE, SPECTRE_V2_USER_CMD_AUTO, SPECTRE_V2_USER_CMD_FORCE, SPECTRE_V2_USER_CMD_PRCTL, SPECTRE_V2_USER_CMD_PRCTL_IBPB, SPECTRE_V2_USER_CMD_SECCOMP, SPECTRE_V2_USER_CMD_SECCOMP_IBPB, }; static const char * const spectre_v2_user_strings[] = { [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", [SPECTRE_V2_USER_STRICT_PREFERRED] = "User space: Mitigation: STIBP always-on protection", [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl", [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", }; static const struct { const char *option; enum spectre_v2_user_cmd cmd; bool secure; } v2_user_options[] __initconst = { { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, { "off", SPECTRE_V2_USER_CMD_NONE, false }, { "on", SPECTRE_V2_USER_CMD_FORCE, true }, { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false }, { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false }, { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false }, { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false }, }; static void __init spec_v2_user_print_cond(const char *reason, bool secure) { if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) pr_info("spectre_v2_user=%s forced on command line.\n", reason); } static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void) { char arg[20]; int ret, i; switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: return SPECTRE_V2_USER_CMD_NONE; case SPECTRE_V2_CMD_FORCE: return SPECTRE_V2_USER_CMD_FORCE; default: break; } ret = cmdline_find_option(boot_command_line, "spectre_v2_user", arg, sizeof(arg)); if (ret < 0) return SPECTRE_V2_USER_CMD_AUTO; for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { if (match_option(arg, ret, v2_user_options[i].option)) { spec_v2_user_print_cond(v2_user_options[i].option, v2_user_options[i].secure); return v2_user_options[i].cmd; } } pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg); return SPECTRE_V2_USER_CMD_AUTO; } static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) { return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS; } static void __init spectre_v2_user_select_mitigation(void) { enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; bool smt_possible = IS_ENABLED(CONFIG_SMP); enum spectre_v2_user_cmd cmd; if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) return; if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || cpu_smt_control == CPU_SMT_NOT_SUPPORTED) smt_possible = false; cmd = spectre_v2_parse_user_cmdline(); switch (cmd) { case SPECTRE_V2_USER_CMD_NONE: goto set_mode; case SPECTRE_V2_USER_CMD_FORCE: mode = SPECTRE_V2_USER_STRICT; break; case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_PRCTL: case SPECTRE_V2_USER_CMD_PRCTL_IBPB: mode = SPECTRE_V2_USER_PRCTL; break; case SPECTRE_V2_USER_CMD_SECCOMP: case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: if (IS_ENABLED(CONFIG_SECCOMP)) mode = SPECTRE_V2_USER_SECCOMP; else mode = SPECTRE_V2_USER_PRCTL; break; } /* Initialize Indirect Branch Prediction Barrier */ if (boot_cpu_has(X86_FEATURE_IBPB)) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB); spectre_v2_user_ibpb = mode; switch (cmd) { case SPECTRE_V2_USER_CMD_NONE: break; case SPECTRE_V2_USER_CMD_FORCE: case SPECTRE_V2_USER_CMD_PRCTL_IBPB: case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: static_branch_enable(&switch_mm_always_ibpb); spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; break; case SPECTRE_V2_USER_CMD_PRCTL: case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_SECCOMP: static_branch_enable(&switch_mm_cond_ibpb); break; } pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", static_key_enabled(&switch_mm_always_ibpb) ? "always-on" : "conditional"); } /* * If no STIBP, Intel enhanced IBRS is enabled, or SMT impossible, STIBP * is not required. * * Intel's Enhanced IBRS also protects against cross-thread branch target * injection in user-mode as the IBRS bit remains always set which * implicitly enables cross-thread protections. However, in legacy IBRS * mode, the IBRS bit is set only on kernel entry and cleared on return * to userspace. AMD Automatic IBRS also does not protect userspace. * These modes therefore disable the implicit cross-thread protection, * so allow for STIBP to be selected in those cases. */ if (!boot_cpu_has(X86_FEATURE_STIBP) || !smt_possible || (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && !boot_cpu_has(X86_FEATURE_AUTOIBRS))) return; /* * At this point, an STIBP mode other than "off" has been set. * If STIBP support is not being forced, check if STIBP always-on * is preferred. */ if (mode != SPECTRE_V2_USER_STRICT && boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) mode = SPECTRE_V2_USER_STRICT_PREFERRED; if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { if (mode != SPECTRE_V2_USER_STRICT && mode != SPECTRE_V2_USER_STRICT_PREFERRED) pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n"); mode = SPECTRE_V2_USER_STRICT_PREFERRED; } spectre_v2_user_stibp = mode; set_mode: pr_info("%s\n", spectre_v2_user_strings[mode]); } static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; static const struct { const char *option; enum spectre_v2_mitigation_cmd cmd; bool secure; } mitigation_options[] __initconst = { { "off", SPECTRE_V2_CMD_NONE, false }, { "on", SPECTRE_V2_CMD_FORCE, true }, { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, { "eibrs", SPECTRE_V2_CMD_EIBRS, false }, { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, { "auto", SPECTRE_V2_CMD_AUTO, false }, { "ibrs", SPECTRE_V2_CMD_IBRS, false }, }; static void __init spec_v2_print_cond(const char *reason, bool secure) { if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) pr_info("%s selected on command line.\n", reason); } static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) { enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; char arg[20]; int ret, i; if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || cpu_mitigations_off()) return SPECTRE_V2_CMD_NONE; ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); if (ret < 0) return SPECTRE_V2_CMD_AUTO; for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { if (!match_option(arg, ret, mitigation_options[i].option)) continue; cmd = mitigation_options[i].cmd; break; } if (i >= ARRAY_SIZE(mitigation_options)) { pr_err("unknown option (%s). Switching to AUTO select\n", arg); return SPECTRE_V2_CMD_AUTO; } if ((cmd == SPECTRE_V2_CMD_RETPOLINE || cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } if ((cmd == SPECTRE_V2_CMD_EIBRS || cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) { pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } if (cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) { pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } spec_v2_print_cond(mitigation_options[i].option, mitigation_options[i].secure); return cmd; } static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) { if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { pr_err("Kernel not compiled with retpoline; no mitigation available!"); return SPECTRE_V2_NONE; } return SPECTRE_V2_RETPOLINE; } static bool __ro_after_init rrsba_disabled; /* Disable in-kernel use of non-RSB RET predictors */ static void __init spec_ctrl_disable_kernel_rrsba(void) { if (rrsba_disabled) return; if (!(x86_arch_cap_msr & ARCH_CAP_RRSBA)) { rrsba_disabled = true; return; } if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) return; x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; update_spec_ctrl(x86_spec_ctrl_base); rrsba_disabled = true; } static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) { /* * Similar to context switches, there are two types of RSB attacks * after VM exit: * * 1) RSB underflow * * 2) Poisoned RSB entry * * When retpoline is enabled, both are mitigated by filling/clearing * the RSB. * * When IBRS is enabled, while #1 would be mitigated by the IBRS branch * prediction isolation protections, RSB still needs to be cleared * because of #2. Note that SMEP provides no protection here, unlike * user-space-poisoned RSB entries. * * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB * bug is present then a LITE version of RSB protection is required, * just a single call needs to retire before a RET is executed. */ switch (mode) { case SPECTRE_V2_NONE: return; case SPECTRE_V2_EIBRS_LFENCE: case SPECTRE_V2_EIBRS: if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); } return; case SPECTRE_V2_EIBRS_RETPOLINE: case SPECTRE_V2_RETPOLINE: case SPECTRE_V2_LFENCE: case SPECTRE_V2_IBRS: setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); return; } pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); dump_stack(); } /* * Set BHI_DIS_S to prevent indirect branches in kernel to be influenced by * branch history in userspace. Not needed if BHI_NO is set. */ static bool __init spec_ctrl_bhi_dis(void) { if (!boot_cpu_has(X86_FEATURE_BHI_CTRL)) return false; x86_spec_ctrl_base |= SPEC_CTRL_BHI_DIS_S; update_spec_ctrl(x86_spec_ctrl_base); setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_HW); return true; } enum bhi_mitigations { BHI_MITIGATION_OFF, BHI_MITIGATION_ON, }; static enum bhi_mitigations bhi_mitigation __ro_after_init = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF; static int __init spectre_bhi_parse_cmdline(char *str) { if (!str) return -EINVAL; if (!strcmp(str, "off")) bhi_mitigation = BHI_MITIGATION_OFF; else if (!strcmp(str, "on")) bhi_mitigation = BHI_MITIGATION_ON; else pr_err("Ignoring unknown spectre_bhi option (%s)", str); return 0; } early_param("spectre_bhi", spectre_bhi_parse_cmdline); static void __init bhi_select_mitigation(void) { if (bhi_mitigation == BHI_MITIGATION_OFF) return; /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */ if (boot_cpu_has(X86_FEATURE_RETPOLINE) && !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) { spec_ctrl_disable_kernel_rrsba(); if (rrsba_disabled) return; } if (spec_ctrl_bhi_dis()) return; if (!IS_ENABLED(CONFIG_X86_64)) return; /* Mitigate KVM by default */ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n"); /* Mitigate syscalls when the mitigation is forced =on */ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP); pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n"); } static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; /* * If the CPU is not affected and the command line mode is NONE or AUTO * then nothing to do. */ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) return; switch (cmd) { case SPECTRE_V2_CMD_NONE: return; case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { mode = SPECTRE_V2_EIBRS; break; } if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) && boot_cpu_has_bug(X86_BUG_RETBLEED) && retbleed_cmd != RETBLEED_CMD_OFF && retbleed_cmd != RETBLEED_CMD_STUFF && boot_cpu_has(X86_FEATURE_IBRS) && boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { mode = SPECTRE_V2_IBRS; break; } mode = spectre_v2_select_retpoline(); break; case SPECTRE_V2_CMD_RETPOLINE_LFENCE: pr_err(SPECTRE_V2_LFENCE_MSG); mode = SPECTRE_V2_LFENCE; break; case SPECTRE_V2_CMD_RETPOLINE_GENERIC: mode = SPECTRE_V2_RETPOLINE; break; case SPECTRE_V2_CMD_RETPOLINE: mode = spectre_v2_select_retpoline(); break; case SPECTRE_V2_CMD_IBRS: mode = SPECTRE_V2_IBRS; break; case SPECTRE_V2_CMD_EIBRS: mode = SPECTRE_V2_EIBRS; break; case SPECTRE_V2_CMD_EIBRS_LFENCE: mode = SPECTRE_V2_EIBRS_LFENCE; break; case SPECTRE_V2_CMD_EIBRS_RETPOLINE: mode = SPECTRE_V2_EIBRS_RETPOLINE; break; } if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); if (spectre_v2_in_ibrs_mode(mode)) { if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) { msr_set_bit(MSR_EFER, _EFER_AUTOIBRS); } else { x86_spec_ctrl_base |= SPEC_CTRL_IBRS; update_spec_ctrl(x86_spec_ctrl_base); } } switch (mode) { case SPECTRE_V2_NONE: case SPECTRE_V2_EIBRS: break; case SPECTRE_V2_IBRS: setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS); if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) pr_warn(SPECTRE_V2_IBRS_PERF_MSG); break; case SPECTRE_V2_LFENCE: case SPECTRE_V2_EIBRS_LFENCE: setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); fallthrough; case SPECTRE_V2_RETPOLINE: case SPECTRE_V2_EIBRS_RETPOLINE: setup_force_cpu_cap(X86_FEATURE_RETPOLINE); break; } /* * Disable alternate RSB predictions in kernel when indirect CALLs and * JMPs gets protection against BHI and Intramode-BTI, but RET * prediction from a non-RSB predictor is still a risk. */ if (mode == SPECTRE_V2_EIBRS_LFENCE || mode == SPECTRE_V2_EIBRS_RETPOLINE || mode == SPECTRE_V2_RETPOLINE) spec_ctrl_disable_kernel_rrsba(); if (boot_cpu_has(X86_BUG_BHI)) bhi_select_mitigation(); spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); /* * If Spectre v2 protection has been enabled, fill the RSB during a * context switch. In general there are two types of RSB attacks * across context switches, for which the CALLs/RETs may be unbalanced. * * 1) RSB underflow * * Some Intel parts have "bottomless RSB". When the RSB is empty, * speculated return targets may come from the branch predictor, * which could have a user-poisoned BTB or BHB entry. * * AMD has it even worse: *all* returns are speculated from the BTB, * regardless of the state of the RSB. * * When IBRS or eIBRS is enabled, the "user -> kernel" attack * scenario is mitigated by the IBRS branch prediction isolation * properties, so the RSB buffer filling wouldn't be necessary to * protect against this type of attack. * * The "user -> user" attack scenario is mitigated by RSB filling. * * 2) Poisoned RSB entry * * If the 'next' in-kernel return stack is shorter than 'prev', * 'next' could be tricked into speculating with a user-poisoned RSB * entry. * * The "user -> kernel" attack scenario is mitigated by SMEP and * eIBRS. * * The "user -> user" scenario, also known as SpectreBHB, requires * RSB clearing. * * So to mitigate all cases, unconditionally fill RSB on context * switches. * * FIXME: Is this pointless for retbleed-affected AMD? */ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); spectre_v2_determine_rsb_fill_type_at_vmexit(mode); /* * Retpoline protects the kernel, but doesn't protect firmware. IBRS * and Enhanced IBRS protect firmware too, so enable IBRS around * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't * otherwise enabled. * * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because * the user might select retpoline on the kernel command line and if * the CPU supports Enhanced IBRS, kernel might un-intentionally not * enable IBRS around firmware calls. */ if (boot_cpu_has_bug(X86_BUG_RETBLEED) && boot_cpu_has(X86_FEATURE_IBPB) && (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) { if (retbleed_cmd != RETBLEED_CMD_IBPB) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW); pr_info("Enabling Speculation Barrier for firmware calls\n"); } } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } /* Set up IBPB and STIBP depending on the general spectre V2 command */ spectre_v2_cmd = cmd; } static void update_stibp_msr(void * __unused) { u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP); update_spec_ctrl(val); } /* Update x86_spec_ctrl_base in case SMT state changed. */ static void update_stibp_strict(void) { u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP; if (sched_smt_active()) mask |= SPEC_CTRL_STIBP; if (mask == x86_spec_ctrl_base) return; pr_info("Update user space SMT mitigation: STIBP %s\n", mask & SPEC_CTRL_STIBP ? "always-on" : "off"); x86_spec_ctrl_base = mask; on_each_cpu(update_stibp_msr, NULL, 1); } /* Update the static key controlling the evaluation of TIF_SPEC_IB */ static void update_indir_branch_cond(void) { if (sched_smt_active()) static_branch_enable(&switch_to_cond_stibp); else static_branch_disable(&switch_to_cond_stibp); } #undef pr_fmt #define pr_fmt(fmt) fmt /* Update the static key controlling the MDS CPU buffer clear in idle */ static void update_mds_branch_idle(void) { /* * Enable the idle clearing if SMT is active on CPUs which are * affected only by MSBDS and not any other MDS variant. * * The other variants cannot be mitigated when SMT is enabled, so * clearing the buffers on idle just to prevent the Store Buffer * repartitioning leak would be a window dressing exercise. */ if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY)) return; if (sched_smt_active()) { static_branch_enable(&mds_idle_clear); } else if (mmio_mitigation == MMIO_MITIGATION_OFF || (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) { static_branch_disable(&mds_idle_clear); } } #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" #define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" #define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" void cpu_bugs_smt_update(void) { mutex_lock(&spec_ctrl_mutex); if (sched_smt_active() && unprivileged_ebpf_enabled() && spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: break; case SPECTRE_V2_USER_STRICT: case SPECTRE_V2_USER_STRICT_PREFERRED: update_stibp_strict(); break; case SPECTRE_V2_USER_PRCTL: case SPECTRE_V2_USER_SECCOMP: update_indir_branch_cond(); break; } switch (mds_mitigation) { case MDS_MITIGATION_FULL: case MDS_MITIGATION_VMWERV: if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) pr_warn_once(MDS_MSG_SMT); update_mds_branch_idle(); break; case MDS_MITIGATION_OFF: break; } switch (taa_mitigation) { case TAA_MITIGATION_VERW: case TAA_MITIGATION_UCODE_NEEDED: if (sched_smt_active()) pr_warn_once(TAA_MSG_SMT); break; case TAA_MITIGATION_TSX_DISABLED: case TAA_MITIGATION_OFF: break; } switch (mmio_mitigation) { case MMIO_MITIGATION_VERW: case MMIO_MITIGATION_UCODE_NEEDED: if (sched_smt_active()) pr_warn_once(MMIO_MSG_SMT); break; case MMIO_MITIGATION_OFF: break; } mutex_unlock(&spec_ctrl_mutex); } #undef pr_fmt #define pr_fmt(fmt) "Speculative Store Bypass: " fmt static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE; /* The kernel command line selection */ enum ssb_mitigation_cmd { SPEC_STORE_BYPASS_CMD_NONE, SPEC_STORE_BYPASS_CMD_AUTO, SPEC_STORE_BYPASS_CMD_ON, SPEC_STORE_BYPASS_CMD_PRCTL, SPEC_STORE_BYPASS_CMD_SECCOMP, }; static const char * const ssb_strings[] = { [SPEC_STORE_BYPASS_NONE] = "Vulnerable", [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled", [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl", [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp", }; static const struct { const char *option; enum ssb_mitigation_cmd cmd; } ssb_mitigation_options[] __initconst = { { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */ { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */ }; static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) { enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO; char arg[20]; int ret, i; if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || cpu_mitigations_off()) { return SPEC_STORE_BYPASS_CMD_NONE; } else { ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", arg, sizeof(arg)); if (ret < 0) return SPEC_STORE_BYPASS_CMD_AUTO; for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { if (!match_option(arg, ret, ssb_mitigation_options[i].option)) continue; cmd = ssb_mitigation_options[i].cmd; break; } if (i >= ARRAY_SIZE(ssb_mitigation_options)) { pr_err("unknown option (%s). Switching to AUTO select\n", arg); return SPEC_STORE_BYPASS_CMD_AUTO; } } return cmd; } static enum ssb_mitigation __init __ssb_select_mitigation(void) { enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE; enum ssb_mitigation_cmd cmd; if (!boot_cpu_has(X86_FEATURE_SSBD)) return mode; cmd = ssb_parse_cmdline(); if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && (cmd == SPEC_STORE_BYPASS_CMD_NONE || cmd == SPEC_STORE_BYPASS_CMD_AUTO)) return mode; switch (cmd) { case SPEC_STORE_BYPASS_CMD_SECCOMP: /* * Choose prctl+seccomp as the default mode if seccomp is * enabled. */ if (IS_ENABLED(CONFIG_SECCOMP)) mode = SPEC_STORE_BYPASS_SECCOMP; else mode = SPEC_STORE_BYPASS_PRCTL; break; case SPEC_STORE_BYPASS_CMD_ON: mode = SPEC_STORE_BYPASS_DISABLE; break; case SPEC_STORE_BYPASS_CMD_AUTO: case SPEC_STORE_BYPASS_CMD_PRCTL: mode = SPEC_STORE_BYPASS_PRCTL; break; case SPEC_STORE_BYPASS_CMD_NONE: break; } /* * We have three CPU feature flags that are in play here: * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation */ if (mode == SPEC_STORE_BYPASS_DISABLE) { setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); /* * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may * use a completely different MSR and bit dependent on family. */ if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && !static_cpu_has(X86_FEATURE_AMD_SSBD)) { x86_amd_ssb_disable(); } else { x86_spec_ctrl_base |= SPEC_CTRL_SSBD; update_spec_ctrl(x86_spec_ctrl_base); } } return mode; } static void ssb_select_mitigation(void) { ssb_mode = __ssb_select_mitigation(); if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) pr_info("%s\n", ssb_strings[ssb_mode]); } #undef pr_fmt #define pr_fmt(fmt) "Speculation prctl: " fmt static void task_update_spec_tif(struct task_struct *tsk) { /* Force the update of the real TIF bits */ set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE); /* * Immediately update the speculation control MSRs for the current * task, but for a non-current task delay setting the CPU * mitigation until it is scheduled next. * * This can only happen for SECCOMP mitigation. For PRCTL it's * always the current task. */ if (tsk == current) speculation_ctrl_update_current(); } static int l1d_flush_prctl_set(struct task_struct *task, unsigned long ctrl) { if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) return -EPERM; switch (ctrl) { case PR_SPEC_ENABLE: set_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); return 0; case PR_SPEC_DISABLE: clear_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); return 0; default: return -ERANGE; } } static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) { if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && ssb_mode != SPEC_STORE_BYPASS_SECCOMP) return -ENXIO; switch (ctrl) { case PR_SPEC_ENABLE: /* If speculation is force disabled, enable is not allowed */ if (task_spec_ssb_force_disable(task)) return -EPERM; task_clear_spec_ssb_disable(task); task_clear_spec_ssb_noexec(task); task_update_spec_tif(task); break; case PR_SPEC_DISABLE: task_set_spec_ssb_disable(task); task_clear_spec_ssb_noexec(task); task_update_spec_tif(task); break; case PR_SPEC_FORCE_DISABLE: task_set_spec_ssb_disable(task); task_set_spec_ssb_force_disable(task); task_clear_spec_ssb_noexec(task); task_update_spec_tif(task); break; case PR_SPEC_DISABLE_NOEXEC: if (task_spec_ssb_force_disable(task)) return -EPERM; task_set_spec_ssb_disable(task); task_set_spec_ssb_noexec(task); task_update_spec_tif(task); break; default: return -ERANGE; } return 0; } static bool is_spec_ib_user_controlled(void) { return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL || spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP; } static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) { switch (ctrl) { case PR_SPEC_ENABLE: if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return 0; /* * With strict mode for both IBPB and STIBP, the instruction * code paths avoid checking this task flag and instead, * unconditionally run the instruction. However, STIBP and IBPB * are independent and either can be set to conditionally * enabled regardless of the mode of the other. * * If either is set to conditional, allow the task flag to be * updated, unless it was force-disabled by a previous prctl * call. Currently, this is possible on an AMD CPU which has the * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the * kernel is booted with 'spectre_v2_user=seccomp', then * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED. */ if (!is_spec_ib_user_controlled() || task_spec_ib_force_disable(task)) return -EPERM; task_clear_spec_ib_disable(task); task_update_spec_tif(task); break; case PR_SPEC_DISABLE: case PR_SPEC_FORCE_DISABLE: /* * Indirect branch speculation is always allowed when * mitigation is force disabled. */ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return -EPERM; if (!is_spec_ib_user_controlled()) return 0; task_set_spec_ib_disable(task); if (ctrl == PR_SPEC_FORCE_DISABLE) task_set_spec_ib_force_disable(task); task_update_spec_tif(task); if (task == current) indirect_branch_prediction_barrier(); break; default: return -ERANGE; } return 0; } int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, unsigned long ctrl) { switch (which) { case PR_SPEC_STORE_BYPASS: return ssb_prctl_set(task, ctrl); case PR_SPEC_INDIRECT_BRANCH: return ib_prctl_set(task, ctrl); case PR_SPEC_L1D_FLUSH: return l1d_flush_prctl_set(task, ctrl); default: return -ENODEV; } } #ifdef CONFIG_SECCOMP void arch_seccomp_spec_mitigate(struct task_struct *task) { if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); if (spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) ib_prctl_set(task, PR_SPEC_FORCE_DISABLE); } #endif static int l1d_flush_prctl_get(struct task_struct *task) { if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) return PR_SPEC_FORCE_DISABLE; if (test_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH)) return PR_SPEC_PRCTL | PR_SPEC_ENABLE; else return PR_SPEC_PRCTL | PR_SPEC_DISABLE; } static int ssb_prctl_get(struct task_struct *task) { switch (ssb_mode) { case SPEC_STORE_BYPASS_NONE: if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) return PR_SPEC_ENABLE; return PR_SPEC_NOT_AFFECTED; case SPEC_STORE_BYPASS_DISABLE: return PR_SPEC_DISABLE; case SPEC_STORE_BYPASS_SECCOMP: case SPEC_STORE_BYPASS_PRCTL: if (task_spec_ssb_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ssb_noexec(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC; if (task_spec_ssb_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; } BUG(); } static int ib_prctl_get(struct task_struct *task) { if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return PR_SPEC_NOT_AFFECTED; if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return PR_SPEC_ENABLE; else if (is_spec_ib_user_controlled()) { if (task_spec_ib_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ib_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; } else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) return PR_SPEC_DISABLE; else return PR_SPEC_NOT_AFFECTED; } int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) { switch (which) { case PR_SPEC_STORE_BYPASS: return ssb_prctl_get(task); case PR_SPEC_INDIRECT_BRANCH: return ib_prctl_get(task); case PR_SPEC_L1D_FLUSH: return l1d_flush_prctl_get(task); default: return -ENODEV; } } void x86_spec_ctrl_setup_ap(void) { if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) update_spec_ctrl(x86_spec_ctrl_base); if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) x86_amd_ssb_disable(); } bool itlb_multihit_kvm_mitigation; EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); #undef pr_fmt #define pr_fmt(fmt) "L1TF: " fmt /* Default mitigation for L1TF-affected CPUs */ enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH; #if IS_ENABLED(CONFIG_KVM_INTEL) EXPORT_SYMBOL_GPL(l1tf_mitigation); #endif enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation); /* * These CPUs all support 44bits physical address space internally in the * cache but CPUID can report a smaller number of physical address bits. * * The L1TF mitigation uses the top most address bit for the inversion of * non present PTEs. When the installed memory reaches into the top most * address bit due to memory holes, which has been observed on machines * which report 36bits physical address bits and have 32G RAM installed, * then the mitigation range check in l1tf_select_mitigation() triggers. * This is a false positive because the mitigation is still possible due to * the fact that the cache uses 44bit internally. Use the cache bits * instead of the reported physical bits and adjust them on the affected * machines to 44bit if the reported bits are less than 44. */ static void override_cache_bits(struct cpuinfo_x86 *c) { if (c->x86 != 6) return; switch (c->x86_vfm) { case INTEL_NEHALEM: case INTEL_WESTMERE: case INTEL_SANDYBRIDGE: case INTEL_IVYBRIDGE: case INTEL_HASWELL: case INTEL_HASWELL_L: case INTEL_HASWELL_G: case INTEL_BROADWELL: case INTEL_BROADWELL_G: case INTEL_SKYLAKE_L: case INTEL_SKYLAKE: case INTEL_KABYLAKE_L: case INTEL_KABYLAKE: if (c->x86_cache_bits < 44) c->x86_cache_bits = 44; break; } } static void __init l1tf_select_mitigation(void) { u64 half_pa; if (!boot_cpu_has_bug(X86_BUG_L1TF)) return; if (cpu_mitigations_off()) l1tf_mitigation = L1TF_MITIGATION_OFF; else if (cpu_mitigations_auto_nosmt()) l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; override_cache_bits(&boot_cpu_data); switch (l1tf_mitigation) { case L1TF_MITIGATION_OFF: case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: break; case L1TF_MITIGATION_FLUSH_NOSMT: case L1TF_MITIGATION_FULL: cpu_smt_disable(false); break; case L1TF_MITIGATION_FULL_FORCE: cpu_smt_disable(true); break; } #if CONFIG_PGTABLE_LEVELS == 2 pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n"); return; #endif half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; if (l1tf_mitigation != L1TF_MITIGATION_OFF && e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) { pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n"); pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n", half_pa); pr_info("However, doing so will make a part of your RAM unusable.\n"); pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n"); return; } setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV); } static int __init l1tf_cmdline(char *str) { if (!boot_cpu_has_bug(X86_BUG_L1TF)) return 0; if (!str) return -EINVAL; if (!strcmp(str, "off")) l1tf_mitigation = L1TF_MITIGATION_OFF; else if (!strcmp(str, "flush,nowarn")) l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOWARN; else if (!strcmp(str, "flush")) l1tf_mitigation = L1TF_MITIGATION_FLUSH; else if (!strcmp(str, "flush,nosmt")) l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; else if (!strcmp(str, "full")) l1tf_mitigation = L1TF_MITIGATION_FULL; else if (!strcmp(str, "full,force")) l1tf_mitigation = L1TF_MITIGATION_FULL_FORCE; return 0; } early_param("l1tf", l1tf_cmdline); #undef pr_fmt #define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt enum srso_mitigation { SRSO_MITIGATION_NONE, SRSO_MITIGATION_UCODE_NEEDED, SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED, SRSO_MITIGATION_MICROCODE, SRSO_MITIGATION_SAFE_RET, SRSO_MITIGATION_IBPB, SRSO_MITIGATION_IBPB_ON_VMEXIT, }; enum srso_mitigation_cmd { SRSO_CMD_OFF, SRSO_CMD_MICROCODE, SRSO_CMD_SAFE_RET, SRSO_CMD_IBPB, SRSO_CMD_IBPB_ON_VMEXIT, }; static const char * const srso_strings[] = { [SRSO_MITIGATION_NONE] = "Vulnerable", [SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", [SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED] = "Vulnerable: Safe RET, no microcode", [SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET", [SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET", [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only" }; static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET; static int __init srso_parse_cmdline(char *str) { if (!str) return -EINVAL; if (!strcmp(str, "off")) srso_cmd = SRSO_CMD_OFF; else if (!strcmp(str, "microcode")) srso_cmd = SRSO_CMD_MICROCODE; else if (!strcmp(str, "safe-ret")) srso_cmd = SRSO_CMD_SAFE_RET; else if (!strcmp(str, "ibpb")) srso_cmd = SRSO_CMD_IBPB; else if (!strcmp(str, "ibpb-vmexit")) srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT; else pr_err("Ignoring unknown SRSO option (%s).", str); return 0; } early_param("spec_rstack_overflow", srso_parse_cmdline); #define SRSO_NOTICE "WARNING: See https://kernel.org/doc/html/latest/admin-guide/hw-vuln/srso.html for mitigation options." static void __init srso_select_mitigation(void) { bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); if (cpu_mitigations_off()) return; if (!boot_cpu_has_bug(X86_BUG_SRSO)) { if (boot_cpu_has(X86_FEATURE_SBPB)) x86_pred_cmd = PRED_CMD_SBPB; return; } if (has_microcode) { /* * Zen1/2 with SMT off aren't vulnerable after the right * IBPB microcode has been applied. * * Zen1/2 don't have SBPB, no need to try to enable it here. */ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { setup_force_cpu_cap(X86_FEATURE_SRSO_NO); return; } if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { srso_mitigation = SRSO_MITIGATION_IBPB; goto out; } } else { pr_warn("IBPB-extending microcode not applied!\n"); pr_warn(SRSO_NOTICE); /* may be overwritten by SRSO_CMD_SAFE_RET below */ srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; } switch (srso_cmd) { case SRSO_CMD_OFF: if (boot_cpu_has(X86_FEATURE_SBPB)) x86_pred_cmd = PRED_CMD_SBPB; return; case SRSO_CMD_MICROCODE: if (has_microcode) { srso_mitigation = SRSO_MITIGATION_MICROCODE; pr_warn(SRSO_NOTICE); } break; case SRSO_CMD_SAFE_RET: if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { /* * Enable the return thunk for generated code * like ftrace, static_call, etc. */ setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); if (boot_cpu_data.x86 == 0x19) { setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); x86_return_thunk = srso_alias_return_thunk; } else { setup_force_cpu_cap(X86_FEATURE_SRSO); x86_return_thunk = srso_return_thunk; } if (has_microcode) srso_mitigation = SRSO_MITIGATION_SAFE_RET; else srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; } else { pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); } break; case SRSO_CMD_IBPB: if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); srso_mitigation = SRSO_MITIGATION_IBPB; } } else { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); } break; case SRSO_CMD_IBPB_ON_VMEXIT: if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; } } else { pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); } break; } out: pr_info("%s\n", srso_strings[srso_mitigation]); } #undef pr_fmt #define pr_fmt(fmt) fmt #ifdef CONFIG_SYSFS #define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion" #if IS_ENABLED(CONFIG_KVM_INTEL) static const char * const l1tf_vmx_states[] = { [VMENTER_L1D_FLUSH_AUTO] = "auto", [VMENTER_L1D_FLUSH_NEVER] = "vulnerable", [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes", [VMENTER_L1D_FLUSH_ALWAYS] = "cache flushes", [VMENTER_L1D_FLUSH_EPT_DISABLED] = "EPT disabled", [VMENTER_L1D_FLUSH_NOT_REQUIRED] = "flush not necessary" }; static ssize_t l1tf_show_state(char *buf) { if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG); if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED || (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER && sched_smt_active())) { return sysfs_emit(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG, l1tf_vmx_states[l1tf_vmx_mitigation]); } return sysfs_emit(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG, l1tf_vmx_states[l1tf_vmx_mitigation], sched_smt_active() ? "vulnerable" : "disabled"); } static ssize_t itlb_multihit_show_state(char *buf) { if (!boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || !boot_cpu_has(X86_FEATURE_VMX)) return sysfs_emit(buf, "KVM: Mitigation: VMX unsupported\n"); else if (!(cr4_read_shadow() & X86_CR4_VMXE)) return sysfs_emit(buf, "KVM: Mitigation: VMX disabled\n"); else if (itlb_multihit_kvm_mitigation) return sysfs_emit(buf, "KVM: Mitigation: Split huge pages\n"); else return sysfs_emit(buf, "KVM: Vulnerable\n"); } #else static ssize_t l1tf_show_state(char *buf) { return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG); } static ssize_t itlb_multihit_show_state(char *buf) { return sysfs_emit(buf, "Processor vulnerable\n"); } #endif static ssize_t mds_show_state(char *buf) { if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { return sysfs_emit(buf, "%s; SMT Host state unknown\n", mds_strings[mds_mitigation]); } if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) { return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" : sched_smt_active() ? "mitigated" : "disabled")); } return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], sched_smt_active() ? "vulnerable" : "disabled"); } static ssize_t tsx_async_abort_show_state(char *buf) { if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || (taa_mitigation == TAA_MITIGATION_OFF)) return sysfs_emit(buf, "%s\n", taa_strings[taa_mitigation]); if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { return sysfs_emit(buf, "%s; SMT Host state unknown\n", taa_strings[taa_mitigation]); } return sysfs_emit(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], sched_smt_active() ? "vulnerable" : "disabled"); } static ssize_t mmio_stale_data_show_state(char *buf) { if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) return sysfs_emit(buf, "Unknown: No mitigations\n"); if (mmio_mitigation == MMIO_MITIGATION_OFF) return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]); if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { return sysfs_emit(buf, "%s; SMT Host state unknown\n", mmio_strings[mmio_mitigation]); } return sysfs_emit(buf, "%s; SMT %s\n", mmio_strings[mmio_mitigation], sched_smt_active() ? "vulnerable" : "disabled"); } static ssize_t rfds_show_state(char *buf) { return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]); } static char *stibp_state(void) { if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && !boot_cpu_has(X86_FEATURE_AUTOIBRS)) return ""; switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: return "; STIBP: disabled"; case SPECTRE_V2_USER_STRICT: return "; STIBP: forced"; case SPECTRE_V2_USER_STRICT_PREFERRED: return "; STIBP: always-on"; case SPECTRE_V2_USER_PRCTL: case SPECTRE_V2_USER_SECCOMP: if (static_key_enabled(&switch_to_cond_stibp)) return "; STIBP: conditional"; } return ""; } static char *ibpb_state(void) { if (boot_cpu_has(X86_FEATURE_IBPB)) { if (static_key_enabled(&switch_mm_always_ibpb)) return "; IBPB: always-on"; if (static_key_enabled(&switch_mm_cond_ibpb)) return "; IBPB: conditional"; return "; IBPB: disabled"; } return ""; } static char *pbrsb_eibrs_state(void) { if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) return "; PBRSB-eIBRS: SW sequence"; else return "; PBRSB-eIBRS: Vulnerable"; } else { return "; PBRSB-eIBRS: Not affected"; } } static const char *spectre_bhi_state(void) { if (!boot_cpu_has_bug(X86_BUG_BHI)) return "; BHI: Not affected"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW)) return "; BHI: BHI_DIS_S"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) return "; BHI: SW loop, KVM: SW loop"; else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) && rrsba_disabled) return "; BHI: Retpoline"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) return "; BHI: Vulnerable, KVM: SW loop"; return "; BHI: Vulnerable"; } static ssize_t spectre_v2_show_state(char *buf) { if (spectre_v2_enabled == SPECTRE_V2_LFENCE) return sysfs_emit(buf, "Vulnerable: LFENCE\n"); if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); if (sched_smt_active() && unprivileged_ebpf_enabled() && spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); return sysfs_emit(buf, "%s%s%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], ibpb_state(), boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? "; IBRS_FW" : "", stibp_state(), boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? "; RSB filling" : "", pbrsb_eibrs_state(), spectre_bhi_state(), /* this should always be at the end */ spectre_v2_module_string()); } static ssize_t srbds_show_state(char *buf) { return sysfs_emit(buf, "%s\n", srbds_strings[srbds_mitigation]); } static ssize_t retbleed_show_state(char *buf) { if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) return sysfs_emit(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n"); return sysfs_emit(buf, "%s; SMT %s\n", retbleed_strings[retbleed_mitigation], !sched_smt_active() ? "disabled" : spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ? "enabled with STIBP protection" : "vulnerable"); } return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]); } static ssize_t srso_show_state(char *buf) { if (boot_cpu_has(X86_FEATURE_SRSO_NO)) return sysfs_emit(buf, "Mitigation: SMT disabled\n"); return sysfs_emit(buf, "%s\n", srso_strings[srso_mitigation]); } static ssize_t gds_show_state(char *buf) { return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]); } static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { if (!boot_cpu_has_bug(bug)) return sysfs_emit(buf, "Not affected\n"); switch (bug) { case X86_BUG_CPU_MELTDOWN: if (boot_cpu_has(X86_FEATURE_PTI)) return sysfs_emit(buf, "Mitigation: PTI\n"); if (hypervisor_is_type(X86_HYPER_XEN_PV)) return sysfs_emit(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n"); break; case X86_BUG_SPECTRE_V1: return sysfs_emit(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: return spectre_v2_show_state(buf); case X86_BUG_SPEC_STORE_BYPASS: return sysfs_emit(buf, "%s\n", ssb_strings[ssb_mode]); case X86_BUG_L1TF: if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) return l1tf_show_state(buf); break; case X86_BUG_MDS: return mds_show_state(buf); case X86_BUG_TAA: return tsx_async_abort_show_state(buf); case X86_BUG_ITLB_MULTIHIT: return itlb_multihit_show_state(buf); case X86_BUG_SRBDS: return srbds_show_state(buf); case X86_BUG_MMIO_STALE_DATA: case X86_BUG_MMIO_UNKNOWN: return mmio_stale_data_show_state(buf); case X86_BUG_RETBLEED: return retbleed_show_state(buf); case X86_BUG_SRSO: return srso_show_state(buf); case X86_BUG_GDS: return gds_show_state(buf); case X86_BUG_RFDS: return rfds_show_state(buf); default: break; } return sysfs_emit(buf, "Vulnerable\n"); } ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN); } ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1); } ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2); } ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS); } ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_L1TF); } ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_MDS); } ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_TAA); } ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); } ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS); } ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf) { if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN); else return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); } ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED); } ssize_t cpu_show_spec_rstack_overflow(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_SRSO); } ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_GDS); } ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_RFDS); } #endif void __warn_thunk(void) { WARN_ONCE(1, "Unpatched return thunk in use. This should not happen!\n"); } |
10 31 15 15 2 12 12 3074 3092 17 9 9 9 9 33 1 1 31 31 1 30 10 10 10 1 9 9 7 2 7 2 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 702 7 3 4 3 12 5 14 12 25 12 25 25 25 12 9 8 13 13 8 13 11 9 9 4 4 3 1 1 3 728 727 728 12 12 12 12 11 1 10 10 3 3 4 4 24 10 10 14 692 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 | // SPDX-License-Identifier: GPL-2.0 /* * gendisk handling * * Portions Copyright (C) 2020 Christoph Hellwig */ #include <linux/module.h> #include <linux/ctype.h> #include <linux/fs.h> #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/major.h> #include <linux/mutex.h> #include <linux/idr.h> #include <linux/log2.h> #include <linux/pm_runtime.h> #include <linux/badblocks.h> #include <linux/part_stat.h> #include <linux/blktrace_api.h> #include "blk-throttle.h" #include "blk.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" #include "blk-cgroup.h" static struct kobject *block_depr; /* * Unique, monotonically increasing sequential number associated with block * devices instances (i.e. incremented each time a device is attached). * Associating uevents with block devices in userspace is difficult and racy: * the uevent netlink socket is lossy, and on slow and overloaded systems has * a very high latency. * Block devices do not have exclusive owners in userspace, any process can set * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0 * can be reused again and again). * A userspace process setting up a block device and watching for its events * cannot thus reliably tell whether an event relates to the device it just set * up or another earlier instance with the same name. * This sequential number allows userspace processes to solve this problem, and * uniquely associate an uevent to the lifetime to a device. */ static atomic64_t diskseq; /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) static DEFINE_IDA(ext_devt_ida); void set_capacity(struct gendisk *disk, sector_t sectors) { bdev_set_nr_sectors(disk->part0, sectors); } EXPORT_SYMBOL(set_capacity); /* * Set disk capacity and notify if the size is not currently zero and will not * be set to zero. Returns true if a uevent was sent, otherwise false. */ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) { sector_t capacity = get_capacity(disk); char *envp[] = { "RESIZE=1", NULL }; set_capacity(disk, size); /* * Only print a message and send a uevent if the gendisk is user visible * and alive. This avoids spamming the log and udev when setting the * initial capacity during probing. */ if (size == capacity || !disk_live(disk) || (disk->flags & GENHD_FL_HIDDEN)) return false; pr_info("%s: detected capacity change from %lld to %lld\n", disk->disk_name, capacity, size); /* * Historically we did not send a uevent for changes to/from an empty * device. */ if (!capacity || !size) return false; kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); return true; } EXPORT_SYMBOL_GPL(set_capacity_and_notify); static void part_stat_read_all(struct block_device *part, struct disk_stats *stat) { int cpu; memset(stat, 0, sizeof(struct disk_stats)); for_each_possible_cpu(cpu) { struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu); int group; for (group = 0; group < NR_STAT_GROUPS; group++) { stat->nsecs[group] += ptr->nsecs[group]; stat->sectors[group] += ptr->sectors[group]; stat->ios[group] += ptr->ios[group]; stat->merges[group] += ptr->merges[group]; } stat->io_ticks += ptr->io_ticks; } } unsigned int part_in_flight(struct block_device *part) { unsigned int inflight = 0; int cpu; for_each_possible_cpu(cpu) { inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) + part_stat_local_read_cpu(part, in_flight[1], cpu); } if ((int)inflight < 0) inflight = 0; return inflight; } static void part_in_flight_rw(struct block_device *part, unsigned int inflight[2]) { int cpu; inflight[0] = 0; inflight[1] = 0; for_each_possible_cpu(cpu) { inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu); inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu); } if ((int)inflight[0] < 0) inflight[0] = 0; if ((int)inflight[1] < 0) inflight[1] = 0; } /* * Can be deleted altogether. Later. * */ #define BLKDEV_MAJOR_HASH_SIZE 255 static struct blk_major_name { struct blk_major_name *next; int major; char name[16]; #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD void (*probe)(dev_t devt); #endif } *major_names[BLKDEV_MAJOR_HASH_SIZE]; static DEFINE_MUTEX(major_names_lock); static DEFINE_SPINLOCK(major_names_spinlock); /* index in the above - for now: assume no multimajor ranges */ static inline int major_to_index(unsigned major) { return major % BLKDEV_MAJOR_HASH_SIZE; } #ifdef CONFIG_PROC_FS void blkdev_show(struct seq_file *seqf, off_t offset) { struct blk_major_name *dp; spin_lock(&major_names_spinlock); for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next) if (dp->major == offset) seq_printf(seqf, "%3d %s\n", dp->major, dp->name); spin_unlock(&major_names_spinlock); } #endif /* CONFIG_PROC_FS */ /** * __register_blkdev - register a new block device * * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If * @major = 0, try to allocate any unused major number. * @name: the name of the new block device as a zero terminated string * @probe: pre-devtmpfs / pre-udev callback used to create disks when their * pre-created device node is accessed. When a probe call uses * add_disk() and it fails the driver must cleanup resources. This * interface may soon be removed. * * The @name must be unique within the system. * * The return value depends on the @major input parameter: * * - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1] * then the function returns zero on success, or a negative error code * - if any unused major number was requested with @major = 0 parameter * then the return value is the allocated major number in range * [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise * * See Documentation/admin-guide/devices.txt for the list of allocated * major numbers. * * Use register_blkdev instead for any new code. */ int __register_blkdev(unsigned int major, const char *name, void (*probe)(dev_t devt)) { struct blk_major_name **n, *p; int index, ret = 0; mutex_lock(&major_names_lock); /* temporary */ if (major == 0) { for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) { if (major_names[index] == NULL) break; } if (index == 0) { printk("%s: failed to get major for %s\n", __func__, name); ret = -EBUSY; goto out; } major = index; ret = major; } if (major >= BLKDEV_MAJOR_MAX) { pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n", __func__, major, BLKDEV_MAJOR_MAX-1, name); ret = -EINVAL; goto out; } p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL); if (p == NULL) { ret = -ENOMEM; goto out; } p->major = major; #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD p->probe = probe; #endif strscpy(p->name, name, sizeof(p->name)); p->next = NULL; index = major_to_index(major); spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) { if ((*n)->major == major) break; } if (!*n) *n = p; else ret = -EBUSY; spin_unlock(&major_names_spinlock); if (ret < 0) { printk("register_blkdev: cannot get major %u for %s\n", major, name); kfree(p); } out: mutex_unlock(&major_names_lock); return ret; } EXPORT_SYMBOL(__register_blkdev); void unregister_blkdev(unsigned int major, const char *name) { struct blk_major_name **n; struct blk_major_name *p = NULL; int index = major_to_index(major); mutex_lock(&major_names_lock); spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) if ((*n)->major == major) break; if (!*n || strcmp((*n)->name, name)) { WARN_ON(1); } else { p = *n; *n = p->next; } spin_unlock(&major_names_spinlock); mutex_unlock(&major_names_lock); kfree(p); } EXPORT_SYMBOL(unregister_blkdev); int blk_alloc_ext_minor(void) { int idx; idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT - 1, GFP_KERNEL); if (idx == -ENOSPC) return -EBUSY; return idx; } void blk_free_ext_minor(unsigned int minor) { ida_free(&ext_devt_ida, minor); } void disk_uevent(struct gendisk *disk, enum kobject_action action) { struct block_device *part; unsigned long idx; rcu_read_lock(); xa_for_each(&disk->part_tbl, idx, part) { if (bdev_is_partition(part) && !bdev_nr_sectors(part)) continue; if (!kobject_get_unless_zero(&part->bd_device.kobj)) continue; rcu_read_unlock(); kobject_uevent(bdev_kobj(part), action); put_device(&part->bd_device); rcu_read_lock(); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(disk_uevent); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) { struct file *file; int ret = 0; if (!disk_has_partscan(disk)) return -EINVAL; if (disk->open_partitions) return -EBUSY; /* * If the device is opened exclusively by current thread already, it's * safe to scan partitons, otherwise, use bd_prepare_to_claim() to * synchronize with other exclusive openers and other partition * scanners. */ if (!(mode & BLK_OPEN_EXCL)) { ret = bd_prepare_to_claim(disk->part0, disk_scan_partitions, NULL); if (ret) return ret; } set_bit(GD_NEED_PART_SCAN, &disk->state); file = bdev_file_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL, NULL); if (IS_ERR(file)) ret = PTR_ERR(file); else fput(file); /* * If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set, * and this will cause that re-assemble partitioned raid device will * creat partition for underlying disk. */ clear_bit(GD_NEED_PART_SCAN, &disk->state); if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(disk->part0, disk_scan_partitions); return ret; } /** * device_add_disk - add disk information to kernel list * @parent: parent device for the disk * @disk: per-device partitioning information * @groups: Additional per-device sysfs groups * * This function registers the partitioning information in @disk * with the kernel. */ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { struct device *ddev = disk_to_dev(disk); int ret; /* Only makes sense for bio-based to set ->poll_bio */ if (queue_is_mq(disk->queue) && disk->fops->poll_bio) return -EINVAL; /* * The disk queue should now be all set with enough information about * the device for the elevator code to pick an adequate default * elevator if one is needed, that is, for devices requesting queue * registration. */ elevator_init_mq(disk->queue); /* Mark bdev as having a submit_bio, if needed */ disk->part0->bd_has_submit_bio = disk->fops->submit_bio != NULL; /* * If the driver provides an explicit major number it also must provide * the number of minors numbers supported, and those will be used to * setup the gendisk. * Otherwise just allocate the device numbers for both the whole device * and all partitions from the extended dev_t space. */ ret = -EINVAL; if (disk->major) { if (WARN_ON(!disk->minors)) goto out_exit_elevator; if (disk->minors > DISK_MAX_PARTS) { pr_err("block: can't allocate more than %d partitions\n", DISK_MAX_PARTS); disk->minors = DISK_MAX_PARTS; } if (disk->first_minor > MINORMASK || disk->minors > MINORMASK + 1 || disk->first_minor + disk->minors > MINORMASK + 1) goto out_exit_elevator; } else { if (WARN_ON(disk->minors)) goto out_exit_elevator; ret = blk_alloc_ext_minor(); if (ret < 0) goto out_exit_elevator; disk->major = BLOCK_EXT_MAJOR; disk->first_minor = ret; } /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); ddev->parent = parent; ddev->groups = groups; dev_set_name(ddev, "%s", disk->disk_name); if (!(disk->flags & GENHD_FL_HIDDEN)) ddev->devt = MKDEV(disk->major, disk->first_minor); ret = device_add(ddev); if (ret) goto out_free_ext_minor; ret = disk_alloc_events(disk); if (ret) goto out_device_del; ret = sysfs_create_link(block_depr, &ddev->kobj, kobject_name(&ddev->kobj)); if (ret) goto out_device_del; /* * avoid probable deadlock caused by allocating memory with * GFP_KERNEL in runtime_resume callback of its all ancestor * devices */ pm_runtime_set_memalloc_noio(ddev, true); disk->part0->bd_holder_dir = kobject_create_and_add("holders", &ddev->kobj); if (!disk->part0->bd_holder_dir) { ret = -ENOMEM; goto out_del_block_link; } disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); if (!disk->slave_dir) { ret = -ENOMEM; goto out_put_holder_dir; } ret = blk_register_queue(disk); if (ret) goto out_put_slave_dir; if (!(disk->flags & GENHD_FL_HIDDEN)) { ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); if (ret) goto out_unregister_queue; bdi_set_owner(disk->bdi, ddev); ret = sysfs_create_link(&ddev->kobj, &disk->bdi->dev->kobj, "bdi"); if (ret) goto out_unregister_bdi; /* Make sure the first partition scan will be proceed */ if (get_capacity(disk) && disk_has_partscan(disk)) set_bit(GD_NEED_PART_SCAN, &disk->state); bdev_add(disk->part0, ddev->devt); if (get_capacity(disk)) disk_scan_partitions(disk, BLK_OPEN_READ); /* * Announce the disk and partitions after all partitions are * created. (for hidden disks uevents remain suppressed forever) */ dev_set_uevent_suppress(ddev, 0); disk_uevent(disk, KOBJ_ADD); } else { /* * Even if the block_device for a hidden gendisk is not * registered, it needs to have a valid bd_dev so that the * freeing of the dynamic major works. */ disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor); } disk_update_readahead(disk); disk_add_events(disk); set_bit(GD_ADDED, &disk->state); return 0; out_unregister_bdi: if (!(disk->flags & GENHD_FL_HIDDEN)) bdi_unregister(disk->bdi); out_unregister_queue: blk_unregister_queue(disk); rq_qos_exit(disk->queue); out_put_slave_dir: kobject_put(disk->slave_dir); disk->slave_dir = NULL; out_put_holder_dir: kobject_put(disk->part0->bd_holder_dir); out_del_block_link: sysfs_remove_link(block_depr, dev_name(ddev)); pm_runtime_set_memalloc_noio(ddev, false); out_device_del: device_del(ddev); out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); out_exit_elevator: if (disk->queue->elevator) elevator_exit(disk->queue); return ret; } EXPORT_SYMBOL(device_add_disk); static void blk_report_disk_dead(struct gendisk *disk, bool surprise) { struct block_device *bdev; unsigned long idx; /* * On surprise disk removal, bdev_mark_dead() may call into file * systems below. Make it clear that we're expecting to not hold * disk->open_mutex. */ lockdep_assert_not_held(&disk->open_mutex); rcu_read_lock(); xa_for_each(&disk->part_tbl, idx, bdev) { if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) continue; rcu_read_unlock(); bdev_mark_dead(bdev, surprise); put_device(&bdev->bd_device); rcu_read_lock(); } rcu_read_unlock(); } static void __blk_mark_disk_dead(struct gendisk *disk) { /* * Fail any new I/O. */ if (test_and_set_bit(GD_DEAD, &disk->state)) return; if (test_bit(GD_OWNS_QUEUE, &disk->state)) blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue); /* * Stop buffered writers from dirtying pages that can't be written out. */ set_capacity(disk, 0); /* * Prevent new I/O from crossing bio_queue_enter(). */ blk_queue_start_drain(disk->queue); } /** * blk_mark_disk_dead - mark a disk as dead * @disk: disk to mark as dead * * Mark as disk as dead (e.g. surprise removed) and don't accept any new I/O * to this disk. */ void blk_mark_disk_dead(struct gendisk *disk) { __blk_mark_disk_dead(disk); blk_report_disk_dead(disk, true); } EXPORT_SYMBOL_GPL(blk_mark_disk_dead); /** * del_gendisk - remove the gendisk * @disk: the struct gendisk to remove * * Removes the gendisk and all its associated resources. This deletes the * partitions associated with the gendisk, and unregisters the associated * request_queue. * * This is the counter to the respective __device_add_disk() call. * * The final removal of the struct gendisk happens when its refcount reaches 0 * with put_disk(), which should be called after del_gendisk(), if * __device_add_disk() was used. * * Drivers exist which depend on the release of the gendisk to be synchronous, * it should not be deferred. * * Context: can sleep */ void del_gendisk(struct gendisk *disk) { struct request_queue *q = disk->queue; struct block_device *part; unsigned long idx; might_sleep(); if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN))) return; disk_del_events(disk); /* * Prevent new openers by unlinked the bdev inode. */ mutex_lock(&disk->open_mutex); xa_for_each(&disk->part_tbl, idx, part) remove_inode_hash(part->bd_inode); mutex_unlock(&disk->open_mutex); /* * Tell the file system to write back all dirty data and shut down if * it hasn't been notified earlier. */ if (!test_bit(GD_DEAD, &disk->state)) blk_report_disk_dead(disk, false); __blk_mark_disk_dead(disk); /* * Drop all partitions now that the disk is marked dead. */ mutex_lock(&disk->open_mutex); xa_for_each_start(&disk->part_tbl, idx, part, 1) drop_partition(part); mutex_unlock(&disk->open_mutex); if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); /* * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). */ bdi_unregister(disk->bdi); } blk_unregister_queue(disk); kobject_put(disk->part0->bd_holder_dir); kobject_put(disk->slave_dir); disk->slave_dir = NULL; part_stat_set_all(disk->part0, 0); disk->part0->bd_stamp = 0; sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); blk_mq_freeze_queue_wait(q); blk_throtl_cancel_bios(disk); blk_sync_queue(q); blk_flush_integrity(); if (queue_is_mq(q)) blk_mq_cancel_work_sync(q); blk_mq_quiesce_queue(q); if (q->elevator) { mutex_lock(&q->sysfs_lock); elevator_exit(q); mutex_unlock(&q->sysfs_lock); } rq_qos_exit(q); blk_mq_unquiesce_queue(q); /* * If the disk does not own the queue, allow using passthrough requests * again. Else leave the queue frozen to fail all I/O. */ if (!test_bit(GD_OWNS_QUEUE, &disk->state)) { blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q); __blk_mq_unfreeze_queue(q, true); } else { if (queue_is_mq(q)) blk_mq_exit_queue(q); } } EXPORT_SYMBOL(del_gendisk); /** * invalidate_disk - invalidate the disk * @disk: the struct gendisk to invalidate * * A helper to invalidates the disk. It will clean the disk's associated * buffer/page caches and reset its internal states so that the disk * can be reused by the drivers. * * Context: can sleep */ void invalidate_disk(struct gendisk *disk) { struct block_device *bdev = disk->part0; invalidate_bdev(bdev); bdev->bd_inode->i_mapping->wb_err = 0; set_capacity(disk, 0); } EXPORT_SYMBOL(invalidate_disk); /* sysfs access to bad-blocks list. */ static ssize_t disk_badblocks_show(struct device *dev, struct device_attribute *attr, char *page) { struct gendisk *disk = dev_to_disk(dev); if (!disk->bb) return sprintf(page, "\n"); return badblocks_show(disk->bb, page, 0); } static ssize_t disk_badblocks_store(struct device *dev, struct device_attribute *attr, const char *page, size_t len) { struct gendisk *disk = dev_to_disk(dev); if (!disk->bb) return -ENXIO; return badblocks_store(disk->bb, page, len, 0); } #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD void blk_request_module(dev_t devt) { unsigned int major = MAJOR(devt); struct blk_major_name **n; mutex_lock(&major_names_lock); for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) { if ((*n)->major == major && (*n)->probe) { (*n)->probe(devt); mutex_unlock(&major_names_lock); return; } } mutex_unlock(&major_names_lock); if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) /* Make old-style 2.4 aliases work */ request_module("block-major-%d", MAJOR(devt)); } #endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */ #ifdef CONFIG_PROC_FS /* iterator */ static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos) { loff_t skip = *pos; struct class_dev_iter *iter; struct device *dev; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return ERR_PTR(-ENOMEM); seqf->private = iter; class_dev_iter_init(iter, &block_class, NULL, &disk_type); do { dev = class_dev_iter_next(iter); if (!dev) return NULL; } while (skip--); return dev_to_disk(dev); } static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos) { struct device *dev; (*pos)++; dev = class_dev_iter_next(seqf->private); if (dev) return dev_to_disk(dev); return NULL; } static void disk_seqf_stop(struct seq_file *seqf, void *v) { struct class_dev_iter *iter = seqf->private; /* stop is called even after start failed :-( */ if (iter) { class_dev_iter_exit(iter); kfree(iter); seqf->private = NULL; } } static void *show_partition_start(struct seq_file *seqf, loff_t *pos) { void *p; p = disk_seqf_start(seqf, pos); if (!IS_ERR_OR_NULL(p) && !*pos) seq_puts(seqf, "major minor #blocks name\n\n"); return p; } static int show_partition(struct seq_file *seqf, void *v) { struct gendisk *sgp = v; struct block_device *part; unsigned long idx; if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN)) return 0; rcu_read_lock(); xa_for_each(&sgp->part_tbl, idx, part) { if (!bdev_nr_sectors(part)) continue; seq_printf(seqf, "%4d %7d %10llu %pg\n", MAJOR(part->bd_dev), MINOR(part->bd_dev), bdev_nr_sectors(part) >> 1, part); } rcu_read_unlock(); return 0; } static const struct seq_operations partitions_op = { .start = show_partition_start, .next = disk_seqf_next, .stop = disk_seqf_stop, .show = show_partition }; #endif static int __init genhd_device_init(void) { int error; error = class_register(&block_class); if (unlikely(error)) return error; blk_dev_init(); register_blkdev(BLOCK_EXT_MAJOR, "blkext"); /* create top-level block dir */ block_depr = kobject_create_and_add("block", NULL); return 0; } subsys_initcall(genhd_device_init); static ssize_t disk_range_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sprintf(buf, "%d\n", disk->minors); } static ssize_t disk_ext_range_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sprintf(buf, "%d\n", (disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS); } static ssize_t disk_removable_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sprintf(buf, "%d\n", (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); } static ssize_t disk_hidden_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sprintf(buf, "%d\n", (disk->flags & GENHD_FL_HIDDEN ? 1 : 0)); } static ssize_t disk_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0); } ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); } ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); struct disk_stats stat; unsigned int inflight; inflight = part_in_flight(bdev); if (inflight) { part_stat_lock(); update_io_ticks(bdev, jiffies, true); part_stat_unlock(); } part_stat_read_all(bdev, &stat); return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " "%8u %8u %8u " "%8lu %8lu %8llu %8u " "%8lu %8u" "\n", stat.ios[STAT_READ], stat.merges[STAT_READ], (unsigned long long)stat.sectors[STAT_READ], (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC), stat.ios[STAT_WRITE], stat.merges[STAT_WRITE], (unsigned long long)stat.sectors[STAT_WRITE], (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC), inflight, jiffies_to_msecs(stat.io_ticks), (unsigned int)div_u64(stat.nsecs[STAT_READ] + stat.nsecs[STAT_WRITE] + stat.nsecs[STAT_DISCARD] + stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC), stat.ios[STAT_DISCARD], stat.merges[STAT_DISCARD], (unsigned long long)stat.sectors[STAT_DISCARD], (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC), stat.ios[STAT_FLUSH], (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); } ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); struct request_queue *q = bdev_get_queue(bdev); unsigned int inflight[2]; if (queue_is_mq(q)) blk_mq_in_flight_rw(q, bdev, inflight); else part_in_flight_rw(bdev, inflight); return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); } static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { dev_warn_once(dev, "the capability attribute has been deprecated.\n"); return sprintf(buf, "0\n"); } static ssize_t disk_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t disk_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t diskseq_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sprintf(buf, "%llu\n", disk->diskseq); } static ssize_t partscan_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%u\n", disk_has_partscan(dev_to_disk(dev))); } static DEVICE_ATTR(range, 0444, disk_range_show, NULL); static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL); static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL); static DEVICE_ATTR(size, 0444, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL); static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL); static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); static DEVICE_ATTR(partscan, 0444, partscan_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail); } ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int i; if (count > 0 && sscanf(buf, "%d", &i) > 0) dev_to_bdev(dev)->bd_make_it_fail = i; return count; } static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); #endif /* CONFIG_FAIL_MAKE_REQUEST */ #ifdef CONFIG_FAIL_IO_TIMEOUT static struct device_attribute dev_attr_fail_timeout = __ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store); #endif static struct attribute *disk_attrs[] = { &dev_attr_range.attr, &dev_attr_ext_range.attr, &dev_attr_removable.attr, &dev_attr_hidden.attr, &dev_attr_ro.attr, &dev_attr_size.attr, &dev_attr_alignment_offset.attr, &dev_attr_discard_alignment.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, &dev_attr_badblocks.attr, &dev_attr_events.attr, &dev_attr_events_async.attr, &dev_attr_events_poll_msecs.attr, &dev_attr_diskseq.attr, &dev_attr_partscan.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif #ifdef CONFIG_FAIL_IO_TIMEOUT &dev_attr_fail_timeout.attr, #endif NULL }; static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, typeof(*dev), kobj); struct gendisk *disk = dev_to_disk(dev); if (a == &dev_attr_badblocks.attr && !disk->bb) return 0; return a->mode; } static struct attribute_group disk_attr_group = { .attrs = disk_attrs, .is_visible = disk_visible, }; static const struct attribute_group *disk_attr_groups[] = { &disk_attr_group, #ifdef CONFIG_BLK_DEV_IO_TRACE &blk_trace_attr_group, #endif #ifdef CONFIG_BLK_DEV_INTEGRITY &blk_integrity_attr_group, #endif NULL }; /** * disk_release - releases all allocated resources of the gendisk * @dev: the device representing this disk * * This function releases all allocated resources of the gendisk. * * Drivers which used __device_add_disk() have a gendisk with a request_queue * assigned. Since the request_queue sits on top of the gendisk for these * drivers we also call blk_put_queue() for them, and we expect the * request_queue refcount to reach 0 at this point, and so the request_queue * will also be freed prior to the disk. * * Context: can sleep */ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); might_sleep(); WARN_ON_ONCE(disk_live(disk)); blk_trace_remove(disk->queue); /* * To undo the all initialization from blk_mq_init_allocated_queue in * case of a probe failure where add_disk is never called we have to * call blk_mq_exit_queue here. We can't do this for the more common * teardown case (yet) as the tagset can be gone by the time the disk * is released once it was added. */ if (queue_is_mq(disk->queue) && test_bit(GD_OWNS_QUEUE, &disk->state) && !test_bit(GD_ADDED, &disk->state)) blk_mq_exit_queue(disk->queue); blkcg_exit_disk(disk); bioset_exit(&disk->bio_split); disk_release_events(disk); kfree(disk->random); disk_free_zone_resources(disk); xa_destroy(&disk->part_tbl); disk->queue->disk = NULL; blk_put_queue(disk->queue); if (test_bit(GD_ADDED, &disk->state) && disk->fops->free_disk) disk->fops->free_disk(disk); iput(disk->part0->bd_inode); /* frees the disk */ } static int block_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct gendisk *disk = dev_to_disk(dev); return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq); } const struct class block_class = { .name = "block", .dev_uevent = block_uevent, }; static char *block_devnode(const struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid) { struct gendisk *disk = dev_to_disk(dev); if (disk->fops->devnode) return disk->fops->devnode(disk, mode); return NULL; } const struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, .devnode = block_devnode, }; #ifdef CONFIG_PROC_FS /* * aggregate disk stat collector. Uses the same stats that the sysfs * entries do, above, but makes them available through one seq_file. * * The output looks suspiciously like /proc/partitions with a bunch of * extra fields. */ static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; struct block_device *hd; unsigned int inflight; struct disk_stats stat; unsigned long idx; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) seq_puts(seqf, "major minor name" " rio rmerge rsect ruse wio wmerge " "wsect wuse running use aveq" "\n\n"); */ rcu_read_lock(); xa_for_each(&gp->part_tbl, idx, hd) { if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) continue; inflight = part_in_flight(hd); if (inflight) { part_stat_lock(); update_io_ticks(hd, jiffies, true); part_stat_unlock(); } part_stat_read_all(hd, &stat); seq_printf(seqf, "%4d %7d %pg " "%lu %lu %lu %u " "%lu %lu %lu %u " "%u %u %u " "%lu %lu %lu %u " "%lu %u" "\n", MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd, stat.ios[STAT_READ], stat.merges[STAT_READ], stat.sectors[STAT_READ], (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC), stat.ios[STAT_WRITE], stat.merges[STAT_WRITE], stat.sectors[STAT_WRITE], (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC), inflight, jiffies_to_msecs(stat.io_ticks), (unsigned int)div_u64(stat.nsecs[STAT_READ] + stat.nsecs[STAT_WRITE] + stat.nsecs[STAT_DISCARD] + stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC), stat.ios[STAT_DISCARD], stat.merges[STAT_DISCARD], stat.sectors[STAT_DISCARD], (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC), stat.ios[STAT_FLUSH], (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC) ); } rcu_read_unlock(); return 0; } static const struct seq_operations diskstats_op = { .start = disk_seqf_start, .next = disk_seqf_next, .stop = disk_seqf_stop, .show = diskstats_show }; static int __init proc_genhd_init(void) { proc_create_seq("diskstats", 0, NULL, &diskstats_op); proc_create_seq("partitions", 0, NULL, &partitions_op); return 0; } module_init(proc_genhd_init); #endif /* CONFIG_PROC_FS */ dev_t part_devt(struct gendisk *disk, u8 partno) { struct block_device *part; dev_t devt = 0; rcu_read_lock(); part = xa_load(&disk->part_tbl, partno); if (part) devt = part->bd_dev; rcu_read_unlock(); return devt; } struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, struct lock_class_key *lkclass) { struct gendisk *disk; disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (!disk) return NULL; if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0)) goto out_free_disk; disk->bdi = bdi_alloc(node_id); if (!disk->bdi) goto out_free_bioset; /* bdev_alloc() might need the queue, set before the first call */ disk->queue = q; disk->part0 = bdev_alloc(disk, 0); if (!disk->part0) goto out_free_bdi; disk->node_id = node_id; mutex_init(&disk->open_mutex); xa_init(&disk->part_tbl); if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) goto out_destroy_part_tbl; if (blkcg_init_disk(disk)) goto out_erase_part0; disk_init_zone_resources(disk); rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); inc_diskseq(disk); q->disk = disk; lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&disk->slave_bdevs); #endif return disk; out_erase_part0: xa_erase(&disk->part_tbl, 0); out_destroy_part_tbl: xa_destroy(&disk->part_tbl); disk->part0->bd_disk = NULL; iput(disk->part0->bd_inode); out_free_bdi: bdi_put(disk->bdi); out_free_bioset: bioset_exit(&disk->bio_split); out_free_disk: kfree(disk); return NULL; } struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, struct lock_class_key *lkclass) { struct queue_limits default_lim = { }; struct request_queue *q; struct gendisk *disk; q = blk_alloc_queue(lim ? lim : &default_lim, node); if (IS_ERR(q)) return ERR_CAST(q); disk = __alloc_disk_node(q, node, lkclass); if (!disk) { blk_put_queue(q); return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); return disk; } EXPORT_SYMBOL(__blk_alloc_disk); /** * put_disk - decrements the gendisk refcount * @disk: the struct gendisk to decrement the refcount for * * This decrements the refcount for the struct gendisk. When this reaches 0 * we'll have disk_release() called. * * Note: for blk-mq disk put_disk must be called before freeing the tag_set * when handling probe errors (that is before add_disk() is called). * * Context: Any context, but the last reference must not be dropped from * atomic context. */ void put_disk(struct gendisk *disk) { if (disk) put_device(disk_to_dev(disk)); } EXPORT_SYMBOL(put_disk); static void set_disk_ro_uevent(struct gendisk *gd, int ro) { char event[] = "DISK_RO=1"; char *envp[] = { event, NULL }; if (!ro) event[8] = '0'; kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); } /** * set_disk_ro - set a gendisk read-only * @disk: gendisk to operate on * @read_only: %true to set the disk read-only, %false set the disk read/write * * This function is used to indicate whether a given disk device should have its * read-only flag set. set_disk_ro() is typically used by device drivers to * indicate whether the underlying physical device is write-protected. */ void set_disk_ro(struct gendisk *disk, bool read_only) { if (read_only) { if (test_and_set_bit(GD_READ_ONLY, &disk->state)) return; } else { if (!test_and_clear_bit(GD_READ_ONLY, &disk->state)) return; } set_disk_ro_uevent(disk, read_only); } EXPORT_SYMBOL(set_disk_ro); void inc_diskseq(struct gendisk *disk) { disk->diskseq = atomic64_inc_return(&diskseq); } |
5 5 1 8 6 73 5 7 50 5 5 5 5 5 7 3 13 17 511 506 8 509 508 511 510 196 511 8 511 510 475 30 36 34 34 33 31 30 29 27 26 24 24 22 1 21 20 17 2 15 2 6 5 1 119 98 20 1 3 93 22 1 5 9 101 106 6 103 3 5 106 2 3 103 3 5 106 4 103 7 105 6 110 107 3 103 7 1 5 1 102 2 1 5 2 97 92 11 87 16 6 81 15 7 15 9 8 4 1 2 74 1 41 2 1 14 41 15 6 12 40 2 50 6 50 3 3 2 44 5 40 11 47 3 50 55 1 30 2 1 15 36 2 7 9 8 1 8 1 1 1 8 7 50 64 59 5 57 4 2 60 2 2 23 55 60 4 64 5 59 7 61 4 61 3 61 3 59 5 59 5 59 60 4 64 8 1 8 5 1 5 3 8 2 1 21 12 10 7 3 3 7 2 5 5 57 57 57 906 906 905 422 421 422 1545 1547 1535 886 886 20 20 20 402 402 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 | // SPDX-License-Identifier: GPL-2.0-only /* * net/core/fib_rules.c Generic Routing Rules * * Authors: Thomas Graf <tgraf@suug.ch> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/module.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/fib_rules.h> #include <net/ip_tunnels.h> #include <linux/indirect_call_wrapper.h> #if defined(CONFIG_IPV6) && defined(CONFIG_IPV6_MULTIPLE_TABLES) #ifdef CONFIG_IP_MULTIPLE_TABLES #define INDIRECT_CALL_MT(f, f2, f1, ...) \ INDIRECT_CALL_INET(f, f2, f1, __VA_ARGS__) #else #define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) #endif #elif defined(CONFIG_IP_MULTIPLE_TABLES) #define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) #else #define INDIRECT_CALL_MT(f, f2, f1, ...) f(__VA_ARGS__) #endif static const struct fib_kuid_range fib_kuid_range_unset = { KUIDT_INIT(0), KUIDT_INIT(~0), }; bool fib_rule_matchall(const struct fib_rule *rule) { if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id || rule->flags) return false; if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) return false; if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) return false; if (fib_rule_port_range_set(&rule->sport_range)) return false; if (fib_rule_port_range_set(&rule->dport_range)) return false; return true; } EXPORT_SYMBOL_GPL(fib_rule_matchall); int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table) { struct fib_rule *r; r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (r == NULL) return -ENOMEM; refcount_set(&r->refcnt, 1); r->action = FR_ACT_TO_TBL; r->pref = pref; r->table = table; r->proto = RTPROT_KERNEL; r->fr_net = ops->fro_net; r->uid_range = fib_kuid_range_unset; r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; /* The lock is not required here, the list in unreacheable * at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); return 0; } EXPORT_SYMBOL(fib_default_rule_add); static u32 fib_default_rule_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; if (!list_empty(&ops->rules_list)) { pos = ops->rules_list.next; if (pos->next != &ops->rules_list) { rule = list_entry(pos->next, struct fib_rule, list); if (rule->pref) return rule->pref - 1; } } return 0; } static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid); static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family) { struct fib_rules_ops *ops; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (ops->family == family) { if (!try_module_get(ops->owner)) ops = NULL; rcu_read_unlock(); return ops; } } rcu_read_unlock(); return NULL; } static void rules_ops_put(struct fib_rules_ops *ops) { if (ops) module_put(ops->owner); } static void flush_route_cache(struct fib_rules_ops *ops) { if (ops->flush_cache) ops->flush_cache(ops); } static int __fib_rules_register(struct fib_rules_ops *ops) { int err = -EEXIST; struct fib_rules_ops *o; struct net *net; net = ops->fro_net; if (ops->rule_size < sizeof(struct fib_rule)) return -EINVAL; if (ops->match == NULL || ops->configure == NULL || ops->compare == NULL || ops->fill == NULL || ops->action == NULL) return -EINVAL; spin_lock(&net->rules_mod_lock); list_for_each_entry(o, &net->rules_ops, list) if (ops->family == o->family) goto errout; list_add_tail_rcu(&ops->list, &net->rules_ops); err = 0; errout: spin_unlock(&net->rules_mod_lock); return err; } struct fib_rules_ops * fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net) { struct fib_rules_ops *ops; int err; ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); if (ops == NULL) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&ops->rules_list); ops->fro_net = net; err = __fib_rules_register(ops); if (err) { kfree(ops); ops = ERR_PTR(err); } return ops; } EXPORT_SYMBOL_GPL(fib_rules_register); static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) { struct fib_rule *rule, *tmp; list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) { list_del_rcu(&rule->list); if (ops->delete) ops->delete(rule); fib_rule_put(rule); } } void fib_rules_unregister(struct fib_rules_ops *ops) { struct net *net = ops->fro_net; spin_lock(&net->rules_mod_lock); list_del_rcu(&ops->list); spin_unlock(&net->rules_mod_lock); fib_rules_cleanup_ops(ops); kfree_rcu(ops, rcu); } EXPORT_SYMBOL_GPL(fib_rules_unregister); static int uid_range_set(struct fib_kuid_range *range) { return uid_valid(range->start) && uid_valid(range->end); } static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb) { struct fib_rule_uid_range *in; struct fib_kuid_range out; in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]); out.start = make_kuid(current_user_ns(), in->start); out.end = make_kuid(current_user_ns(), in->end); return out; } static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) { struct fib_rule_uid_range out = { from_kuid_munged(current_user_ns(), range->start), from_kuid_munged(current_user_ns(), range->end) }; return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); } static int nla_get_port_range(struct nlattr *pattr, struct fib_rule_port_range *port_range) { const struct fib_rule_port_range *pr = nla_data(pattr); if (!fib_rule_port_range_valid(pr)) return -EINVAL; port_range->start = pr->start; port_range->end = pr->end; return 0; } static int nla_put_port_range(struct sk_buff *skb, int attrtype, struct fib_rule_port_range *range) { return nla_put(skb, attrtype, sizeof(*range), range); } static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { int ret = 0; if (rule->iifindex && (rule->iifindex != fl->flowi_iif)) goto out; if (rule->oifindex && (rule->oifindex != fl->flowi_oif)) goto out; if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) goto out; if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) goto out; if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) goto out; if (uid_lt(fl->flowi_uid, rule->uid_range.start) || uid_gt(fl->flowi_uid, rule->uid_range.end)) goto out; ret = INDIRECT_CALL_MT(ops->match, fib6_rule_match, fib4_rule_match, rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; } int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { struct fib_rule *rule; int err; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { jumped: if (!fib_rule_match(rule, ops, fl, flags, arg)) continue; if (rule->action == FR_ACT_GOTO) { struct fib_rule *target; target = rcu_dereference(rule->ctarget); if (target == NULL) { continue; } else { rule = target; goto jumped; } } else if (rule->action == FR_ACT_NOP) continue; else err = INDIRECT_CALL_MT(ops->action, fib6_rule_action, fib4_rule_action, rule, fl, flags, arg); if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress, fib6_rule_suppress, fib4_rule_suppress, rule, flags, arg)) continue; if (err != -EAGAIN) { if ((arg->flags & FIB_LOOKUP_NOREF) || likely(refcount_inc_not_zero(&rule->refcnt))) { arg->rule = rule; goto out; } break; } } err = -ESRCH; out: rcu_read_unlock(); return err; } EXPORT_SYMBOL_GPL(fib_rules_lookup); static int call_fib_rule_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_rule *rule, int family, struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = family, .info.extack = extack, .rule = rule, }; return call_fib_notifier(nb, event_type, &info.info); } static int call_fib_rule_notifiers(struct net *net, enum fib_event_type event_type, struct fib_rule *rule, struct fib_rules_ops *ops, struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = ops->family, .info.extack = extack, .rule = rule, }; ops->fib_rules_seq++; return call_fib_notifiers(net, event_type, &info.info); } /* Called with rcu_read_lock() */ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack) { struct fib_rules_ops *ops; struct fib_rule *rule; int err = 0; ops = lookup_rules_ops(net, family); if (!ops) return -EAFNOSUPPORT; list_for_each_entry_rcu(rule, &ops->rules_list, list) { err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, rule, family, extack); if (err) break; } rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_rules_dump); unsigned int fib_rules_seq_read(struct net *net, int family) { unsigned int fib_rules_seq; struct fib_rules_ops *ops; ASSERT_RTNL(); ops = lookup_rules_ops(net, family); if (!ops) return 0; fib_rules_seq = ops->fib_rules_seq; rules_ops_put(ops); return fib_rules_seq; } EXPORT_SYMBOL_GPL(fib_rules_seq_read); static struct fib_rule *rule_find(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule, bool user_priority) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { if (rule->action && r->action != rule->action) continue; if (rule->table && r->table != rule->table) continue; if (user_priority && r->pref != rule->pref) continue; if (rule->iifname[0] && memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; if (rule->oifname[0] && memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; if (rule->mark && r->mark != rule->mark) continue; if (rule->suppress_ifgroup != -1 && r->suppress_ifgroup != rule->suppress_ifgroup) continue; if (rule->suppress_prefixlen != -1 && r->suppress_prefixlen != rule->suppress_prefixlen) continue; if (rule->mark_mask && r->mark_mask != rule->mark_mask) continue; if (rule->tun_id && r->tun_id != rule->tun_id) continue; if (r->fr_net != rule->fr_net) continue; if (rule->l3mdev && r->l3mdev != rule->l3mdev) continue; if (uid_range_set(&rule->uid_range) && (!uid_eq(r->uid_range.start, rule->uid_range.start) || !uid_eq(r->uid_range.end, rule->uid_range.end))) continue; if (rule->ip_proto && r->ip_proto != rule->ip_proto) continue; if (rule->proto && r->proto != rule->proto) continue; if (fib_rule_port_range_set(&rule->sport_range) && !fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; if (fib_rule_port_range_set(&rule->dport_range) && !fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (!ops->compare(r, frh, tb)) continue; return r; } return NULL; } #ifdef CONFIG_NET_L3_MASTER_DEV static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, struct netlink_ext_ack *extack) { nlrule->l3mdev = nla_get_u8(nla); if (nlrule->l3mdev != 1) { NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute"); return -1; } return 0; } #else static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel"); return -1; } #endif static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct fib_rules_ops *ops, struct nlattr *tb[], struct fib_rule **rule, bool *user_priority) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rule *nlrule = NULL; int err = -EINVAL; if (frh->src_len) if (!tb[FRA_SRC] || frh->src_len > (ops->addr_size * 8) || nla_len(tb[FRA_SRC]) != ops->addr_size) { NL_SET_ERR_MSG(extack, "Invalid source address"); goto errout; } if (frh->dst_len) if (!tb[FRA_DST] || frh->dst_len > (ops->addr_size * 8) || nla_len(tb[FRA_DST]) != ops->addr_size) { NL_SET_ERR_MSG(extack, "Invalid dst address"); goto errout; } nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (!nlrule) { err = -ENOMEM; goto errout; } refcount_set(&nlrule->refcnt, 1); nlrule->fr_net = net; if (tb[FRA_PRIORITY]) { nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]); *user_priority = true; } else { nlrule->pref = fib_default_rule_pref(ops); } nlrule->proto = tb[FRA_PROTOCOL] ? nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC; if (tb[FRA_IIFNAME]) { struct net_device *dev; nlrule->iifindex = -1; nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->iifname); if (dev) nlrule->iifindex = dev->ifindex; } if (tb[FRA_OIFNAME]) { struct net_device *dev; nlrule->oifindex = -1; nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->oifname); if (dev) nlrule->oifindex = dev->ifindex; } if (tb[FRA_FWMARK]) { nlrule->mark = nla_get_u32(tb[FRA_FWMARK]); if (nlrule->mark) /* compatibility: if the mark value is non-zero all bits * are compared unless a mask is explicitly specified. */ nlrule->mark_mask = 0xFFFFFFFF; } if (tb[FRA_FWMASK]) nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); if (tb[FRA_TUN_ID]) nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); if (tb[FRA_L3MDEV] && fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0) goto errout_free; nlrule->action = frh->action; nlrule->flags = frh->flags; nlrule->table = frh_get_table(frh, tb); if (tb[FRA_SUPPRESS_PREFIXLEN]) nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); else nlrule->suppress_prefixlen = -1; if (tb[FRA_SUPPRESS_IFGROUP]) nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); else nlrule->suppress_ifgroup = -1; if (tb[FRA_GOTO]) { if (nlrule->action != FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Unexpected goto"); goto errout_free; } nlrule->target = nla_get_u32(tb[FRA_GOTO]); /* Backward jumps are prohibited to avoid endless loops */ if (nlrule->target <= nlrule->pref) { NL_SET_ERR_MSG(extack, "Backward goto not supported"); goto errout_free; } } else if (nlrule->action == FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); goto errout_free; } if (nlrule->l3mdev && nlrule->table) { NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive"); goto errout_free; } if (tb[FRA_UID_RANGE]) { if (current_user_ns() != net->user_ns) { err = -EPERM; NL_SET_ERR_MSG(extack, "No permission to set uid"); goto errout_free; } nlrule->uid_range = nla_get_kuid_range(tb); if (!uid_range_set(&nlrule->uid_range) || !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) { NL_SET_ERR_MSG(extack, "Invalid uid range"); goto errout_free; } } else { nlrule->uid_range = fib_kuid_range_unset; } if (tb[FRA_IP_PROTO]) nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); if (tb[FRA_SPORT_RANGE]) { err = nla_get_port_range(tb[FRA_SPORT_RANGE], &nlrule->sport_range); if (err) { NL_SET_ERR_MSG(extack, "Invalid sport range"); goto errout_free; } } if (tb[FRA_DPORT_RANGE]) { err = nla_get_port_range(tb[FRA_DPORT_RANGE], &nlrule->dport_range); if (err) { NL_SET_ERR_MSG(extack, "Invalid dport range"); goto errout_free; } } *rule = nlrule; return 0; errout_free: kfree(nlrule); errout: return err; } static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { if (r->action != rule->action) continue; if (r->table != rule->table) continue; if (r->pref != rule->pref) continue; if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; if (r->mark != rule->mark) continue; if (r->suppress_ifgroup != rule->suppress_ifgroup) continue; if (r->suppress_prefixlen != rule->suppress_prefixlen) continue; if (r->mark_mask != rule->mark_mask) continue; if (r->tun_id != rule->tun_id) continue; if (r->fr_net != rule->fr_net) continue; if (r->l3mdev != rule->l3mdev) continue; if (!uid_eq(r->uid_range.start, rule->uid_range.start) || !uid_eq(r->uid_range.end, rule->uid_range.end)) continue; if (r->ip_proto != rule->ip_proto) continue; if (r->proto != rule->proto) continue; if (!fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; if (!fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (!ops->compare(r, frh, tb)) continue; return 1; } return 0; } static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { [FRA_UNSPEC] = { .strict_start_type = FRA_DPORT_RANGE + 1 }, [FRA_IIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [FRA_OIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [FRA_PRIORITY] = { .type = NLA_U32 }, [FRA_FWMARK] = { .type = NLA_U32 }, [FRA_FLOW] = { .type = NLA_U32 }, [FRA_TUN_ID] = { .type = NLA_U64 }, [FRA_FWMASK] = { .type = NLA_U32 }, [FRA_TABLE] = { .type = NLA_U32 }, [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, [FRA_GOTO] = { .type = NLA_U32 }, [FRA_L3MDEV] = { .type = NLA_U8 }, [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }, [FRA_PROTOCOL] = { .type = NLA_U8 }, [FRA_IP_PROTO] = { .type = NLA_U8 }, [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) } }; int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct fib_rule *rule = NULL, *r, *last = NULL; struct nlattr *tb[FRA_MAX + 1]; int err = -EINVAL, unresolved = 0; bool user_priority = false; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } ops = lookup_rules_ops(net, frh->family); if (!ops) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; } err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority); if (err) goto errout; if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; goto errout_free; } err = ops->configure(rule, skb, frh, tb, extack); if (err < 0) goto errout_free; err = call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, extack); if (err < 0) goto errout_free; list_for_each_entry(r, &ops->rules_list, list) { if (r->pref == rule->target) { RCU_INIT_POINTER(rule->ctarget, r); break; } } if (rcu_dereference_protected(rule->ctarget, 1) == NULL) unresolved = 1; list_for_each_entry(r, &ops->rules_list, list) { if (r->pref > rule->pref) break; last = r; } if (last) list_add_rcu(&rule->list, &last->list); else list_add_rcu(&rule->list, &ops->rules_list); if (ops->unresolved_rules) { /* * There are unresolved goto rules in the list, check if * any of them are pointing to this new rule. */ list_for_each_entry(r, &ops->rules_list, list) { if (r->action == FR_ACT_GOTO && r->target == rule->pref && rtnl_dereference(r->ctarget) == NULL) { rcu_assign_pointer(r->ctarget, rule); if (--ops->unresolved_rules == 0) break; } } } if (rule->action == FR_ACT_GOTO) ops->nr_goto_rules++; if (unresolved) ops->unresolved_rules++; if (rule->tun_id) ip_tunnel_need_metadata(); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); return 0; errout_free: kfree(rule); errout: rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_nl_newrule); int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct fib_rule *rule = NULL, *r, *nlrule = NULL; struct nlattr *tb[FRA_MAX+1]; int err = -EINVAL; bool user_priority = false; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } ops = lookup_rules_ops(net, frh->family); if (ops == NULL) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; } err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority); if (err) goto errout; rule = rule_find(ops, frh, tb, nlrule, user_priority); if (!rule) { err = -ENOENT; goto errout; } if (rule->flags & FIB_RULE_PERMANENT) { err = -EPERM; goto errout; } if (ops->delete) { err = ops->delete(rule); if (err) goto errout; } if (rule->tun_id) ip_tunnel_unneed_metadata(); list_del_rcu(&rule->list); if (rule->action == FR_ACT_GOTO) { ops->nr_goto_rules--; if (rtnl_dereference(rule->ctarget) == NULL) ops->unresolved_rules--; } /* * Check if this rule is a target to any of them. If so, * adjust to the next one with the same preference or * disable them. As this operation is eventually very * expensive, it is only performed if goto rules, except * current if it is goto rule, have actually been added. */ if (ops->nr_goto_rules > 0) { struct fib_rule *n; n = list_next_entry(rule, list); if (&n->list == &ops->rules_list || n->pref != rule->pref) n = NULL; list_for_each_entry(r, &ops->rules_list, list) { if (rtnl_dereference(r->ctarget) != rule) continue; rcu_assign_pointer(r->ctarget, n); if (!n) ops->unresolved_rules++; } } call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); kfree(nlrule); return 0; errout: kfree(nlrule); rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_nl_delrule); static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) { size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)) + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */ + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ + nla_total_size_64bit(8) /* FRA_TUN_ID */ + nla_total_size(sizeof(struct fib_kuid_range)) + nla_total_size(1) /* FRA_PROTOCOL */ + nla_total_size(1) /* FRA_IP_PROTO */ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); return payload; } static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, u32 pid, u32 seq, int type, int flags, struct fib_rules_ops *ops) { struct nlmsghdr *nlh; struct fib_rule_hdr *frh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags); if (nlh == NULL) return -EMSGSIZE; frh = nlmsg_data(nlh); frh->family = ops->family; frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) goto nla_put_failure; frh->res1 = 0; frh->res2 = 0; frh->action = rule->action; frh->flags = rule->flags; if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto)) goto nla_put_failure; if (rule->action == FR_ACT_GOTO && rcu_access_pointer(rule->ctarget) == NULL) frh->flags |= FIB_RULE_UNRESOLVED; if (rule->iifname[0]) { if (nla_put_string(skb, FRA_IIFNAME, rule->iifname)) goto nla_put_failure; if (rule->iifindex == -1) frh->flags |= FIB_RULE_IIF_DETACHED; } if (rule->oifname[0]) { if (nla_put_string(skb, FRA_OIFNAME, rule->oifname)) goto nla_put_failure; if (rule->oifindex == -1) frh->flags |= FIB_RULE_OIF_DETACHED; } if ((rule->pref && nla_put_u32(skb, FRA_PRIORITY, rule->pref)) || (rule->mark && nla_put_u32(skb, FRA_FWMARK, rule->mark)) || ((rule->mark_mask || rule->mark) && nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) || (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target)) || (rule->tun_id && nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) || (rule->l3mdev && nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) || (uid_range_set(&rule->uid_range) && nla_put_uid_range(skb, &rule->uid_range)) || (fib_rule_port_range_set(&rule->sport_range) && nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || (fib_rule_port_range_set(&rule->dport_range) && nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup)) goto nla_put_failure; } if (ops->fill(rule, skb, frh) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, struct fib_rules_ops *ops) { int idx = 0; struct fib_rule *rule; int err = 0; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { if (idx < cb->args[1]) goto skip; err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWRULE, NLM_F_MULTI, ops); if (err) break; skip: idx++; } rcu_read_unlock(); cb->args[1] = idx; rules_ops_put(ops); return err; } static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct fib_rule_hdr *frh; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); return -EINVAL; } frh = nlmsg_data(nlh); if (frh->dst_len || frh->src_len || frh->tos || frh->table || frh->res1 || frh->res2 || frh->action || frh->flags) { NL_SET_ERR_MSG(extack, "Invalid values in header for fib rule dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request"); return -EINVAL; } return 0; } static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct fib_rules_ops *ops; int idx = 0, family; if (cb->strict_check) { int err = fib_valid_dumprule_req(nlh, cb->extack); if (err < 0) return err; } family = rtnl_msg_family(nlh); if (family != AF_UNSPEC) { /* Protocol specific dump request */ ops = lookup_rules_ops(net, family); if (ops == NULL) return -EAFNOSUPPORT; dump_rules(skb, cb, ops); return skb->len; } rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (idx < cb->args[0] || !try_module_get(ops->owner)) goto skip; if (dump_rules(skb, cb, ops) < 0) break; cb->args[1] = 0; skip: idx++; } rcu_read_unlock(); cb->args[0] = idx; return skb->len; } static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid) { struct net *net; struct sk_buff *skb; int err = -ENOMEM; net = ops->fro_net; skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL); if (skb == NULL) goto errout; err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops); if (err < 0) { /* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); return; errout: if (err < 0) rtnl_set_sk_err(net, ops->nlgroup, err); } static void attach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; list_for_each_entry(rule, rules, list) { if (rule->iifindex == -1 && strcmp(dev->name, rule->iifname) == 0) rule->iifindex = dev->ifindex; if (rule->oifindex == -1 && strcmp(dev->name, rule->oifname) == 0) rule->oifindex = dev->ifindex; } } static void detach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; list_for_each_entry(rule, rules, list) { if (rule->iifindex == dev->ifindex) rule->iifindex = -1; if (rule->oifindex == dev->ifindex) rule->oifindex = -1; } } static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct fib_rules_ops *ops; ASSERT_RTNL(); switch (event) { case NETDEV_REGISTER: list_for_each_entry(ops, &net->rules_ops, list) attach_rules(&ops->rules_list, dev); break; case NETDEV_CHANGENAME: list_for_each_entry(ops, &net->rules_ops, list) { detach_rules(&ops->rules_list, dev); attach_rules(&ops->rules_list, dev); } break; case NETDEV_UNREGISTER: list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); break; } return NOTIFY_DONE; } static struct notifier_block fib_rules_notifier = { .notifier_call = fib_rules_event, }; static int __net_init fib_rules_net_init(struct net *net) { INIT_LIST_HEAD(&net->rules_ops); spin_lock_init(&net->rules_mod_lock); return 0; } static void __net_exit fib_rules_net_exit(struct net *net) { WARN_ON_ONCE(!list_empty(&net->rules_ops)); } static struct pernet_operations fib_rules_net_ops = { .init = fib_rules_net_init, .exit = fib_rules_net_exit, }; static int __init fib_rules_init(void) { int err; rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) goto fail; err = register_netdevice_notifier(&fib_rules_notifier); if (err < 0) goto fail_unregister; return 0; fail_unregister: unregister_pernet_subsys(&fib_rules_net_ops); fail: rtnl_unregister(PF_UNSPEC, RTM_NEWRULE); rtnl_unregister(PF_UNSPEC, RTM_DELRULE); rtnl_unregister(PF_UNSPEC, RTM_GETRULE); return err; } subsys_initcall(fib_rules_init); |
1205 2996 700 699 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 | /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/backing-dev.h * * low-level device information and state which is propagated up through * to high-level code. */ #ifndef _LINUX_BACKING_DEV_H #define _LINUX_BACKING_DEV_H #include <linux/kernel.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/device.h> #include <linux/writeback.h> #include <linux/backing-dev-defs.h> #include <linux/slab.h> static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) { kref_get(&bdi->refcnt); return bdi; } struct backing_dev_info *bdi_get_by_id(u64 id); void bdi_put(struct backing_dev_info *bdi); __printf(2, 3) int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...); __printf(2, 0) int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args); void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); struct backing_dev_info *bdi_alloc(int node_id); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wait_for_completion(struct wb_completion *done); extern spinlock_t bdi_lock; extern struct list_head bdi_list; extern struct workqueue_struct *bdi_wq; static inline bool wb_has_dirty_io(struct bdi_writeback *wb) { return test_bit(WB_has_dirty_io, &wb->state); } static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi) { /* * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are * any dirty wbs. See wb_update_write_bandwidth(). */ return atomic_long_read(&bdi->tot_write_bandwidth); } static inline void wb_stat_mod(struct bdi_writeback *wb, enum wb_stat_item item, s64 amount) { percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); } static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { wb_stat_mod(wb, item, 1); } static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { wb_stat_mod(wb, item, -1); } static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { return percpu_counter_read_positive(&wb->stat[item]); } static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item) { return percpu_counter_sum_positive(&wb->stat[item]); } extern void wb_writeout_inc(struct bdi_writeback *wb); /* * maximal error of a stat counter. */ static inline unsigned long wb_stat_error(void) { #ifdef CONFIG_SMP return nr_cpu_ids * WB_STAT_BATCH; #else return 1; #endif } /* BDI ratio is expressed as part per 1000000 for finer granularity. */ #define BDI_RATIO_SCALE 10000 u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* * Flags in backing_dev_info::capability * * BDI_CAP_WRITEBACK: Supports dirty page writeback, and dirty pages * should contribute to accounting * BDI_CAP_WRITEBACK_ACCT: Automatically account writeback pages * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold */ #define BDI_CAP_WRITEBACK (1 << 0) #define BDI_CAP_WRITEBACK_ACCT (1 << 1) #define BDI_CAP_STRICTLIMIT (1 << 2) extern struct backing_dev_info noop_backing_dev_info; int bdi_init(struct backing_dev_info *bdi); /** * writeback_in_progress - determine whether there is writeback in progress * @wb: bdi_writeback of interest * * Determine whether there is writeback waiting to be handled against a * bdi_writeback. */ static inline bool writeback_in_progress(struct bdi_writeback *wb) { return test_bit(WB_writeback_running, &wb->state); } struct backing_dev_info *inode_to_bdi(struct inode *inode); static inline bool mapping_can_writeback(struct address_space *mapping) { return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; } #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css); struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp); void wb_memcg_offline(struct mem_cgroup *memcg); void wb_blkcg_offline(struct cgroup_subsys_state *css); /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode * @inode: inode of interest * * Cgroup writeback requires support from the filesystem. Also, both memcg and * iocg have to be on the default hierarchy. Test whether all conditions are * met. * * Note that the test result may change dynamically on the same inode * depending on how memcg and iocg are configured. */ static inline bool inode_cgwb_enabled(struct inode *inode) { struct backing_dev_info *bdi = inode_to_bdi(inode); return cgroup_subsys_on_dfl(memory_cgrp_subsys) && cgroup_subsys_on_dfl(io_cgrp_subsys) && (bdi->capabilities & BDI_CAP_WRITEBACK) && (inode->i_sb->s_iflags & SB_I_CGROUPWB); } /** * wb_find_current - find wb for %current on a bdi * @bdi: bdi of interest * * Find the wb of @bdi which matches both the memcg and blkcg of %current. * Must be called under rcu_read_lock() which protects the returend wb. * NULL if not found. */ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) { struct cgroup_subsys_state *memcg_css; struct bdi_writeback *wb; memcg_css = task_css(current, memory_cgrp_id); if (!memcg_css->parent) return &bdi->wb; wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); /* * %current's blkcg equals the effective blkcg of its memcg. No * need to use the relatively expensive cgroup_get_e_css(). */ if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id))) return wb; return NULL; } /** * wb_get_create_current - get or create wb for %current on a bdi * @bdi: bdi of interest * @gfp: allocation mask * * Equivalent to wb_get_create() on %current's memcg. This function is * called from a relatively hot path and optimizes the common cases using * wb_find_current(). */ static inline struct bdi_writeback * wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) { struct bdi_writeback *wb; rcu_read_lock(); wb = wb_find_current(bdi); if (wb && unlikely(!wb_tryget(wb))) wb = NULL; rcu_read_unlock(); if (unlikely(!wb)) { struct cgroup_subsys_state *memcg_css; memcg_css = task_get_css(current, memory_cgrp_id); wb = wb_get_create(bdi, memcg_css, gfp); css_put(memcg_css); } return wb; } /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest * * Returns the wb @inode is currently associated with. The caller must be * holding either @inode->i_lock, the i_pages lock, or the * associated wb's list_lock. */ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&inode->i_lock) && !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) && !lockdep_is_held(&inode->i_wb->list_lock))); #endif return inode->i_wb; } static inline struct bdi_writeback *inode_to_wb_wbc( struct inode *inode, struct writeback_control *wbc) { /* * If wbc does not have inode attached, it means cgroup writeback was * disabled when wbc started. Just use the default wb in that case. */ return wbc->wb ? wbc->wb : &inode_to_bdi(inode)->wb; } /** * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction * @inode: target inode * @cookie: output param, to be passed to the end function * * The caller wants to access the wb associated with @inode but isn't * holding inode->i_lock, the i_pages lock or wb->list_lock. This * function determines the wb associated with @inode and ensures that the * association doesn't change until the transaction is finished with * unlocked_inode_to_wb_end(). * * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and * can't sleep during the transaction. IRQs may or may not be disabled on * return. */ static inline struct bdi_writeback * unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { rcu_read_lock(); /* * Paired with store_release in inode_switch_wbs_work_fn() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; if (unlikely(cookie->locked)) xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags); /* * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages * lock. inode_to_wb() will bark. Deref directly. */ return inode->i_wb; } /** * unlocked_inode_to_wb_end - end inode wb access transaction * @inode: target inode * @cookie: @cookie from unlocked_inode_to_wb_begin() */ static inline void unlocked_inode_to_wb_end(struct inode *inode, struct wb_lock_cookie *cookie) { if (unlikely(cookie->locked)) xa_unlock_irqrestore(&inode->i_mapping->i_pages, cookie->flags); rcu_read_unlock(); } #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool inode_cgwb_enabled(struct inode *inode) { return false; } static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) { return &bdi->wb; } static inline struct bdi_writeback * wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) { return &bdi->wb; } static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; } static inline struct bdi_writeback *inode_to_wb_wbc( struct inode *inode, struct writeback_control *wbc) { return inode_to_wb(inode); } static inline struct bdi_writeback * unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { return inode_to_wb(inode); } static inline void unlocked_inode_to_wb_end(struct inode *inode, struct wb_lock_cookie *cookie) { } static inline void wb_memcg_offline(struct mem_cgroup *memcg) { } static inline void wb_blkcg_offline(struct cgroup_subsys_state *css) { } #endif /* CONFIG_CGROUP_WRITEBACK */ const char *bdi_dev_name(struct backing_dev_info *bdi); #endif /* _LINUX_BACKING_DEV_H */ |
1 1 1 1 11 11 11 11 73 10 74 1 3 11 16 14 2 15 15 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 | // SPDX-License-Identifier: GPL-2.0 #include <linux/anon_inodes.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/magic.h> #include <linux/mount.h> #include <linux/pid.h> #include <linux/pidfs.h> #include <linux/pid_namespace.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/pseudo_fs.h> #include <linux/seq_file.h> #include <uapi/linux/pidfd.h> #include "internal.h" #ifdef CONFIG_PROC_FS /** * pidfd_show_fdinfo - print information about a pidfd * @m: proc fdinfo file * @f: file referencing a pidfd * * Pid: * This function will print the pid that a given pidfd refers to in the * pid namespace of the procfs instance. * If the pid namespace of the process is not a descendant of the pid * namespace of the procfs instance 0 will be shown as its pid. This is * similar to calling getppid() on a process whose parent is outside of * its pid namespace. * * NSpid: * If pid namespaces are supported then this function will also print * the pid of a given pidfd refers to for all descendant pid namespaces * starting from the current pid namespace of the instance, i.e. the * Pid field and the first entry in the NSpid field will be identical. * If the pid namespace of the process is not a descendant of the pid * namespace of the procfs instance 0 will be shown as its first NSpid * entry and no others will be shown. * Note that this differs from the Pid and NSpid fields in * /proc/<pid>/status where Pid and NSpid are always shown relative to * the pid namespace of the procfs instance. The difference becomes * obvious when sending around a pidfd between pid namespaces from a * different branch of the tree, i.e. where no ancestral relation is * present between the pid namespaces: * - create two new pid namespaces ns1 and ns2 in the initial pid * namespace (also take care to create new mount namespaces in the * new pid namespace and mount procfs) * - create a process with a pidfd in ns1 * - send pidfd from ns1 to ns2 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid * have exactly one entry, which is 0 */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { struct pid *pid = pidfd_pid(f); struct pid_namespace *ns; pid_t nr = -1; if (likely(pid_has_task(pid, PIDTYPE_PID))) { ns = proc_pid_ns(file_inode(m->file)->i_sb); nr = pid_nr_ns(pid, ns); } seq_put_decimal_ll(m, "Pid:\t", nr); #ifdef CONFIG_PID_NS seq_put_decimal_ll(m, "\nNSpid:\t", nr); if (nr > 0) { int i; /* If nr is non-zero it means that 'pid' is valid and that * ns, i.e. the pid namespace associated with the procfs * instance, is in the pid namespace hierarchy of pid. * Start at one below the already printed level. */ for (i = ns->level + 1; i <= pid->level; i++) seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); } #endif seq_putc(m, '\n'); } #endif /* * Poll support for process exit notification. */ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { struct pid *pid = pidfd_pid(file); bool thread = file->f_flags & PIDFD_THREAD; struct task_struct *task; __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); /* * Depending on PIDFD_THREAD, inform pollers when the thread * or the whole thread-group exits. */ guard(rcu)(); task = pid_task(pid, PIDTYPE_PID); if (!task) poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; else if (task->exit_state && (thread || thread_group_empty(task))) poll_flags = EPOLLIN | EPOLLRDNORM; return poll_flags; } static const struct file_operations pidfs_file_operations = { .poll = pidfd_poll, #ifdef CONFIG_PROC_FS .show_fdinfo = pidfd_show_fdinfo, #endif }; struct pid *pidfd_pid(const struct file *file) { if (file->f_op != &pidfs_file_operations) return ERR_PTR(-EBADF); return file_inode(file)->i_private; } static struct vfsmount *pidfs_mnt __ro_after_init; #if BITS_PER_LONG == 32 /* * Provide a fallback mechanism for 32-bit systems so processes remain * reliably comparable by inode number even on those systems. */ static DEFINE_IDA(pidfd_inum_ida); static int pidfs_inum(struct pid *pid, unsigned long *ino) { int ret; ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1, UINT_MAX, GFP_ATOMIC); if (ret < 0) return -ENOSPC; *ino = ret; return 0; } static inline void pidfs_free_inum(unsigned long ino) { if (ino > 0) ida_free(&pidfd_inum_ida, ino); } #else static inline int pidfs_inum(struct pid *pid, unsigned long *ino) { *ino = pid->ino; return 0; } #define pidfs_free_inum(ino) ((void)(ino)) #endif /* * The vfs falls back to simple_setattr() if i_op->setattr() isn't * implemented. Let's reject it completely until we have a clean * permission concept for pidfds. */ static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { return -EOPNOTSUPP; } static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } static const struct inode_operations pidfs_inode_operations = { .getattr = pidfs_getattr, .setattr = pidfs_setattr, }; static void pidfs_evict_inode(struct inode *inode) { struct pid *pid = inode->i_private; clear_inode(inode); put_pid(pid); pidfs_free_inum(inode->i_ino); } static const struct super_operations pidfs_sops = { .drop_inode = generic_delete_inode, .evict_inode = pidfs_evict_inode, .statfs = simple_statfs, }; static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) { struct inode *inode = d_inode(dentry); struct pid *pid = inode->i_private; return dynamic_dname(buffer, buflen, "pidfd:[%llu]", pid->ino); } static const struct dentry_operations pidfs_dentry_operations = { .d_delete = always_delete_dentry, .d_dname = pidfs_dname, .d_prune = stashed_dentry_prune, }; static int pidfs_init_inode(struct inode *inode, void *data) { inode->i_private = data; inode->i_flags |= S_PRIVATE; inode->i_mode |= S_IRWXU; inode->i_op = &pidfs_inode_operations; inode->i_fop = &pidfs_file_operations; /* * Inode numbering for pidfs start at RESERVED_PIDS + 1. This * avoids collisions with the root inode which is 1 for pseudo * filesystems. */ return pidfs_inum(data, &inode->i_ino); } static void pidfs_put_data(void *data) { struct pid *pid = data; put_pid(pid); } static const struct stashed_operations pidfs_stashed_ops = { .init_inode = pidfs_init_inode, .put_data = pidfs_put_data, }; static int pidfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx; ctx = init_pseudo(fc, PID_FS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &pidfs_sops; ctx->dops = &pidfs_dentry_operations; fc->s_fs_info = (void *)&pidfs_stashed_ops; return 0; } static struct file_system_type pidfs_type = { .name = "pidfs", .init_fs_context = pidfs_init_fs_context, .kill_sb = kill_anon_super, }; struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) { struct file *pidfd_file; struct path path; int ret; ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); if (ret < 0) return ERR_PTR(ret); pidfd_file = dentry_open(&path, flags, current_cred()); path_put(&path); return pidfd_file; } void __init pidfs_init(void) { pidfs_mnt = kern_mount(&pidfs_type); if (IS_ERR(pidfs_mnt)) panic("Failed to mount pidfs pseudo filesystem"); } |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 | // SPDX-License-Identifier: GPL-2.0-or-later /* * MSI GT683R led driver * * Copyright (c) 2014 Janne Kanniainen <janne.kanniainen@gmail.com> */ #include <linux/device.h> #include <linux/hid.h> #include <linux/kernel.h> #include <linux/leds.h> #include <linux/module.h> #include "hid-ids.h" #define GT683R_BUFFER_SIZE 8 /* * GT683R_LED_OFF: all LEDs are off * GT683R_LED_AUDIO: LEDs brightness depends on sound level * GT683R_LED_BREATHING: LEDs brightness varies at human breathing rate * GT683R_LED_NORMAL: LEDs are fully on when enabled */ enum gt683r_led_mode { GT683R_LED_OFF = 0, GT683R_LED_AUDIO = 2, GT683R_LED_BREATHING = 3, GT683R_LED_NORMAL = 5 }; enum gt683r_panels { GT683R_LED_BACK = 0, GT683R_LED_SIDE = 1, GT683R_LED_FRONT = 2, GT683R_LED_COUNT, }; static const char * const gt683r_panel_names[] = { "back", "side", "front", }; struct gt683r_led { struct hid_device *hdev; struct led_classdev led_devs[GT683R_LED_COUNT]; struct mutex lock; struct work_struct work; enum led_brightness brightnesses[GT683R_LED_COUNT]; enum gt683r_led_mode mode; }; static const struct hid_device_id gt683r_led_id[] = { { HID_USB_DEVICE(USB_VENDOR_ID_MSI, USB_DEVICE_ID_MSI_GT683R_LED_PANEL) }, { } }; MODULE_DEVICE_TABLE(hid, gt683r_led_id); static void gt683r_brightness_set(struct led_classdev *led_cdev, enum led_brightness brightness) { int i; struct device *dev = led_cdev->dev->parent; struct hid_device *hdev = to_hid_device(dev); struct gt683r_led *led = hid_get_drvdata(hdev); for (i = 0; i < GT683R_LED_COUNT; i++) { if (led_cdev == &led->led_devs[i]) break; } if (i < GT683R_LED_COUNT) { led->brightnesses[i] = brightness; schedule_work(&led->work); } } static ssize_t mode_show(struct device *dev, struct device_attribute *attr, char *buf) { u8 sysfs_mode; struct hid_device *hdev = to_hid_device(dev->parent); struct gt683r_led *led = hid_get_drvdata(hdev); if (led->mode == GT683R_LED_NORMAL) sysfs_mode = 0; else if (led->mode == GT683R_LED_AUDIO) sysfs_mode = 1; else sysfs_mode = 2; return scnprintf(buf, PAGE_SIZE, "%u\n", sysfs_mode); } static ssize_t mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { u8 sysfs_mode; struct hid_device *hdev = to_hid_device(dev->parent); struct gt683r_led *led = hid_get_drvdata(hdev); if (kstrtou8(buf, 10, &sysfs_mode) || sysfs_mode > 2) return -EINVAL; mutex_lock(&led->lock); if (sysfs_mode == 0) led->mode = GT683R_LED_NORMAL; else if (sysfs_mode == 1) led->mode = GT683R_LED_AUDIO; else led->mode = GT683R_LED_BREATHING; mutex_unlock(&led->lock); schedule_work(&led->work); return count; } static int gt683r_led_snd_msg(struct gt683r_led *led, u8 *msg) { int ret; ret = hid_hw_raw_request(led->hdev, msg[0], msg, GT683R_BUFFER_SIZE, HID_FEATURE_REPORT, HID_REQ_SET_REPORT); if (ret != GT683R_BUFFER_SIZE) { hid_err(led->hdev, "failed to send set report request: %i\n", ret); if (ret < 0) return ret; return -EIO; } return 0; } static int gt683r_leds_set(struct gt683r_led *led, u8 leds) { int ret; u8 *buffer; buffer = kzalloc(GT683R_BUFFER_SIZE, GFP_KERNEL); if (!buffer) return -ENOMEM; buffer[0] = 0x01; buffer[1] = 0x02; buffer[2] = 0x30; buffer[3] = leds; ret = gt683r_led_snd_msg(led, buffer); kfree(buffer); return ret; } static int gt683r_mode_set(struct gt683r_led *led, u8 mode) { int ret; u8 *buffer; buffer = kzalloc(GT683R_BUFFER_SIZE, GFP_KERNEL); if (!buffer) return -ENOMEM; buffer[0] = 0x01; buffer[1] = 0x02; buffer[2] = 0x20; buffer[3] = mode; buffer[4] = 0x01; ret = gt683r_led_snd_msg(led, buffer); kfree(buffer); return ret; } static void gt683r_led_work(struct work_struct *work) { int i; u8 leds = 0; u8 mode; struct gt683r_led *led = container_of(work, struct gt683r_led, work); mutex_lock(&led->lock); for (i = 0; i < GT683R_LED_COUNT; i++) { if (led->brightnesses[i]) leds |= BIT(i); } if (gt683r_leds_set(led, leds)) goto fail; if (leds) mode = led->mode; else mode = GT683R_LED_OFF; gt683r_mode_set(led, mode); fail: mutex_unlock(&led->lock); } static DEVICE_ATTR_RW(mode); static struct attribute *gt683r_led_attrs[] = { &dev_attr_mode.attr, NULL }; static const struct attribute_group gt683r_led_group = { .name = "gt683r", .attrs = gt683r_led_attrs, }; static const struct attribute_group *gt683r_led_groups[] = { >683r_led_group, NULL }; static int gt683r_led_probe(struct hid_device *hdev, const struct hid_device_id *id) { int i; int ret; int name_sz; char *name; struct gt683r_led *led; led = devm_kzalloc(&hdev->dev, sizeof(*led), GFP_KERNEL); if (!led) return -ENOMEM; mutex_init(&led->lock); INIT_WORK(&led->work, gt683r_led_work); led->mode = GT683R_LED_NORMAL; led->hdev = hdev; hid_set_drvdata(hdev, led); ret = hid_parse(hdev); if (ret) { hid_err(hdev, "hid parsing failed\n"); return ret; } ret = hid_hw_start(hdev, HID_CONNECT_HIDRAW); if (ret) { hid_err(hdev, "hw start failed\n"); return ret; } for (i = 0; i < GT683R_LED_COUNT; i++) { name_sz = strlen(dev_name(&hdev->dev)) + strlen(gt683r_panel_names[i]) + 3; name = devm_kzalloc(&hdev->dev, name_sz, GFP_KERNEL); if (!name) { ret = -ENOMEM; goto fail; } snprintf(name, name_sz, "%s::%s", dev_name(&hdev->dev), gt683r_panel_names[i]); led->led_devs[i].name = name; led->led_devs[i].max_brightness = 1; led->led_devs[i].brightness_set = gt683r_brightness_set; led->led_devs[i].groups = gt683r_led_groups; ret = led_classdev_register(&hdev->dev, &led->led_devs[i]); if (ret) { hid_err(hdev, "could not register led device\n"); goto fail; } } return 0; fail: for (i = i - 1; i >= 0; i--) led_classdev_unregister(&led->led_devs[i]); hid_hw_stop(hdev); return ret; } static void gt683r_led_remove(struct hid_device *hdev) { int i; struct gt683r_led *led = hid_get_drvdata(hdev); for (i = 0; i < GT683R_LED_COUNT; i++) led_classdev_unregister(&led->led_devs[i]); flush_work(&led->work); hid_hw_stop(hdev); } static struct hid_driver gt683r_led_driver = { .probe = gt683r_led_probe, .remove = gt683r_led_remove, .name = "gt683r_led", .id_table = gt683r_led_id, }; module_hid_driver(gt683r_led_driver); MODULE_AUTHOR("Janne Kanniainen"); MODULE_DESCRIPTION("MSI GT683R led driver"); MODULE_LICENSE("GPL"); |
56 9 4 9 20 52 51 52 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 | /* * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/in.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/ipv6.h> #include "rds_single_path.h" #include "rds.h" #include "loop.h" static DEFINE_SPINLOCK(loop_conns_lock); static LIST_HEAD(loop_conns); static atomic_t rds_loop_unloading = ATOMIC_INIT(0); static void rds_loop_set_unloading(void) { atomic_set(&rds_loop_unloading, 1); } static bool rds_loop_is_unloading(struct rds_connection *conn) { return atomic_read(&rds_loop_unloading) != 0; } /* * This 'loopback' transport is a special case for flows that originate * and terminate on the same machine. * * Connection build-up notices if the destination address is thought of * as a local address by a transport. At that time it decides to use the * loopback transport instead of the bound transport of the sending socket. * * The loopback transport's sending path just hands the sent rds_message * straight to the receiving path via an embedded rds_incoming. */ /* * Usually a message transits both the sender and receiver's conns as it * flows to the receiver. In the loopback case, though, the receive path * is handed the sending conn so the sense of the addresses is reversed. */ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off) { struct scatterlist *sgp = &rm->data.op_sg[sg]; int ret = sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); /* Do not send cong updates to loopback */ if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { rds_cong_map_updated(conn->c_fcong, ~(u64) 0); ret = min_t(int, ret, sgp->length - conn->c_xmit_data_off); goto out; } BUG_ON(hdr_off || sg || off); rds_inc_init(&rm->m_inc, conn, &conn->c_laddr); /* For the embedded inc. Matching put is in loop_inc_free() */ rds_message_addref(rm); rds_recv_incoming(conn, &conn->c_laddr, &conn->c_faddr, &rm->m_inc, GFP_KERNEL); rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence), NULL); rds_inc_put(&rm->m_inc); out: return ret; } /* * See rds_loop_xmit(). Since our inc is embedded in the rm, we * make sure the rm lives at least until the inc is done. */ static void rds_loop_inc_free(struct rds_incoming *inc) { struct rds_message *rm = container_of(inc, struct rds_message, m_inc); rds_message_put(rm); } /* we need to at least give the thread something to succeed */ static int rds_loop_recv_path(struct rds_conn_path *cp) { return 0; } struct rds_loop_connection { struct list_head loop_node; struct rds_connection *conn; }; /* * Even the loopback transport needs to keep track of its connections, * so it can call rds_conn_destroy() on them on exit. N.B. there are * 1+ loopback addresses (127.*.*.*) so it's not a bug to have * multiple loopback conns allocated, although rather useless. */ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) { struct rds_loop_connection *lc; unsigned long flags; lc = kzalloc(sizeof(struct rds_loop_connection), gfp); if (!lc) return -ENOMEM; INIT_LIST_HEAD(&lc->loop_node); lc->conn = conn; conn->c_transport_data = lc; spin_lock_irqsave(&loop_conns_lock, flags); list_add_tail(&lc->loop_node, &loop_conns); spin_unlock_irqrestore(&loop_conns_lock, flags); return 0; } static void rds_loop_conn_free(void *arg) { struct rds_loop_connection *lc = arg; unsigned long flags; rdsdebug("lc %p\n", lc); spin_lock_irqsave(&loop_conns_lock, flags); list_del(&lc->loop_node); spin_unlock_irqrestore(&loop_conns_lock, flags); kfree(lc); } static int rds_loop_conn_path_connect(struct rds_conn_path *cp) { rds_connect_complete(cp->cp_conn); return 0; } static void rds_loop_conn_path_shutdown(struct rds_conn_path *cp) { } void rds_loop_exit(void) { struct rds_loop_connection *lc, *_lc; LIST_HEAD(tmp_list); rds_loop_set_unloading(); synchronize_rcu(); /* avoid calling conn_destroy with irqs off */ spin_lock_irq(&loop_conns_lock); list_splice(&loop_conns, &tmp_list); INIT_LIST_HEAD(&loop_conns); spin_unlock_irq(&loop_conns_lock); list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) { WARN_ON(lc->conn->c_passive); rds_conn_destroy(lc->conn); } } static void rds_loop_kill_conns(struct net *net) { struct rds_loop_connection *lc, *_lc; LIST_HEAD(tmp_list); spin_lock_irq(&loop_conns_lock); list_for_each_entry_safe(lc, _lc, &loop_conns, loop_node) { struct net *c_net = read_pnet(&lc->conn->c_net); if (net != c_net) continue; list_move_tail(&lc->loop_node, &tmp_list); } spin_unlock_irq(&loop_conns_lock); list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) { WARN_ON(lc->conn->c_passive); rds_conn_destroy(lc->conn); } } static void __net_exit rds_loop_exit_net(struct net *net) { rds_loop_kill_conns(net); } static struct pernet_operations rds_loop_net_ops = { .exit = rds_loop_exit_net, }; int rds_loop_net_init(void) { return register_pernet_device(&rds_loop_net_ops); } void rds_loop_net_exit(void) { unregister_pernet_device(&rds_loop_net_ops); } /* * This is missing .xmit_* because loop doesn't go through generic * rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and * .laddr_check are missing because transport.c doesn't iterate over * rds_loop_transport. */ struct rds_transport rds_loop_transport = { .xmit = rds_loop_xmit, .recv_path = rds_loop_recv_path, .conn_alloc = rds_loop_conn_alloc, .conn_free = rds_loop_conn_free, .conn_path_connect = rds_loop_conn_path_connect, .conn_path_shutdown = rds_loop_conn_path_shutdown, .inc_copy_to_user = rds_message_inc_copy_to_user, .inc_free = rds_loop_inc_free, .t_name = "loopback", .t_type = RDS_TRANS_LOOP, .t_unloading = rds_loop_is_unloading, }; |
5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 | // SPDX-License-Identifier: GPL-2.0-only /* * i8042 keyboard and mouse controller driver for Linux * * Copyright (c) 1999-2004 Vojtech Pavlik */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/serio.h> #include <linux/err.h> #include <linux/rcupdate.h> #include <linux/platform_device.h> #include <linux/i8042.h> #include <linux/slab.h> #include <linux/suspend.h> #include <linux/property.h> #include <asm/io.h> MODULE_AUTHOR("Vojtech Pavlik <vojtech@suse.cz>"); MODULE_DESCRIPTION("i8042 keyboard and mouse controller driver"); MODULE_LICENSE("GPL"); static bool i8042_nokbd; module_param_named(nokbd, i8042_nokbd, bool, 0); MODULE_PARM_DESC(nokbd, "Do not probe or use KBD port."); static bool i8042_noaux; module_param_named(noaux, i8042_noaux, bool, 0); MODULE_PARM_DESC(noaux, "Do not probe or use AUX (mouse) port."); static bool i8042_nomux; module_param_named(nomux, i8042_nomux, bool, 0); MODULE_PARM_DESC(nomux, "Do not check whether an active multiplexing controller is present."); static bool i8042_unlock; module_param_named(unlock, i8042_unlock, bool, 0); MODULE_PARM_DESC(unlock, "Ignore keyboard lock."); static bool i8042_probe_defer; module_param_named(probe_defer, i8042_probe_defer, bool, 0); MODULE_PARM_DESC(probe_defer, "Allow deferred probing."); enum i8042_controller_reset_mode { I8042_RESET_NEVER, I8042_RESET_ALWAYS, I8042_RESET_ON_S2RAM, #define I8042_RESET_DEFAULT I8042_RESET_ON_S2RAM }; static enum i8042_controller_reset_mode i8042_reset = I8042_RESET_DEFAULT; static int i8042_set_reset(const char *val, const struct kernel_param *kp) { enum i8042_controller_reset_mode *arg = kp->arg; int error; bool reset; if (val) { error = kstrtobool(val, &reset); if (error) return error; } else { reset = true; } *arg = reset ? I8042_RESET_ALWAYS : I8042_RESET_NEVER; return 0; } static const struct kernel_param_ops param_ops_reset_param = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = i8042_set_reset, }; #define param_check_reset_param(name, p) \ __param_check(name, p, enum i8042_controller_reset_mode) module_param_named(reset, i8042_reset, reset_param, 0); MODULE_PARM_DESC(reset, "Reset controller on resume, cleanup or both"); static bool i8042_direct; module_param_named(direct, i8042_direct, bool, 0); MODULE_PARM_DESC(direct, "Put keyboard port into non-translated mode."); static bool i8042_dumbkbd; module_param_named(dumbkbd, i8042_dumbkbd, bool, 0); MODULE_PARM_DESC(dumbkbd, "Pretend that controller can only read data from keyboard"); static bool i8042_noloop; module_param_named(noloop, i8042_noloop, bool, 0); MODULE_PARM_DESC(noloop, "Disable the AUX Loopback command while probing for the AUX port"); static bool i8042_notimeout; module_param_named(notimeout, i8042_notimeout, bool, 0); MODULE_PARM_DESC(notimeout, "Ignore timeouts signalled by i8042"); static bool i8042_kbdreset; module_param_named(kbdreset, i8042_kbdreset, bool, 0); MODULE_PARM_DESC(kbdreset, "Reset device connected to KBD port"); #ifdef CONFIG_X86 static bool i8042_dritek; module_param_named(dritek, i8042_dritek, bool, 0); MODULE_PARM_DESC(dritek, "Force enable the Dritek keyboard extension"); #endif #ifdef CONFIG_PNP static bool i8042_nopnp; module_param_named(nopnp, i8042_nopnp, bool, 0); MODULE_PARM_DESC(nopnp, "Do not use PNP to detect controller settings"); #endif #define DEBUG #ifdef DEBUG static bool i8042_debug; module_param_named(debug, i8042_debug, bool, 0600); MODULE_PARM_DESC(debug, "Turn i8042 debugging mode on and off"); static bool i8042_unmask_kbd_data; module_param_named(unmask_kbd_data, i8042_unmask_kbd_data, bool, 0600); MODULE_PARM_DESC(unmask_kbd_data, "Unconditional enable (may reveal sensitive data) of normally sanitize-filtered kbd data traffic debug log [pre-condition: i8042.debug=1 enabled]"); #endif static bool i8042_present; static bool i8042_bypass_aux_irq_test; static char i8042_kbd_firmware_id[128]; static char i8042_aux_firmware_id[128]; static struct fwnode_handle *i8042_kbd_fwnode; #include "i8042.h" /* * i8042_lock protects serialization between i8042_command and * the interrupt handler. */ static DEFINE_SPINLOCK(i8042_lock); /* * Writers to AUX and KBD ports as well as users issuing i8042_command * directly should acquire i8042_mutex (by means of calling * i8042_lock_chip() and i8042_unlock_chip() helpers) to ensure that * they do not disturb each other (unfortunately in many i8042 * implementations write to one of the ports will immediately abort * command that is being processed by another port). */ static DEFINE_MUTEX(i8042_mutex); struct i8042_port { struct serio *serio; int irq; bool exists; bool driver_bound; signed char mux; }; #define I8042_KBD_PORT_NO 0 #define I8042_AUX_PORT_NO 1 #define I8042_MUX_PORT_NO 2 #define I8042_NUM_PORTS (I8042_NUM_MUX_PORTS + 2) static struct i8042_port i8042_ports[I8042_NUM_PORTS]; static unsigned char i8042_initial_ctr; static unsigned char i8042_ctr; static bool i8042_mux_present; static bool i8042_kbd_irq_registered; static bool i8042_aux_irq_registered; static unsigned char i8042_suppress_kbd_ack; static struct platform_device *i8042_platform_device; static struct notifier_block i8042_kbd_bind_notifier_block; static irqreturn_t i8042_interrupt(int irq, void *dev_id); static bool (*i8042_platform_filter)(unsigned char data, unsigned char str, struct serio *serio); void i8042_lock_chip(void) { mutex_lock(&i8042_mutex); } EXPORT_SYMBOL(i8042_lock_chip); void i8042_unlock_chip(void) { mutex_unlock(&i8042_mutex); } EXPORT_SYMBOL(i8042_unlock_chip); int i8042_install_filter(bool (*filter)(unsigned char data, unsigned char str, struct serio *serio)) { unsigned long flags; int ret = 0; spin_lock_irqsave(&i8042_lock, flags); if (i8042_platform_filter) { ret = -EBUSY; goto out; } i8042_platform_filter = filter; out: spin_unlock_irqrestore(&i8042_lock, flags); return ret; } EXPORT_SYMBOL(i8042_install_filter); int i8042_remove_filter(bool (*filter)(unsigned char data, unsigned char str, struct serio *port)) { unsigned long flags; int ret = 0; spin_lock_irqsave(&i8042_lock, flags); if (i8042_platform_filter != filter) { ret = -EINVAL; goto out; } i8042_platform_filter = NULL; out: spin_unlock_irqrestore(&i8042_lock, flags); return ret; } EXPORT_SYMBOL(i8042_remove_filter); /* * The i8042_wait_read() and i8042_wait_write functions wait for the i8042 to * be ready for reading values from it / writing values to it. * Called always with i8042_lock held. */ static int i8042_wait_read(void) { int i = 0; while ((~i8042_read_status() & I8042_STR_OBF) && (i < I8042_CTL_TIMEOUT)) { udelay(50); i++; } return -(i == I8042_CTL_TIMEOUT); } static int i8042_wait_write(void) { int i = 0; while ((i8042_read_status() & I8042_STR_IBF) && (i < I8042_CTL_TIMEOUT)) { udelay(50); i++; } return -(i == I8042_CTL_TIMEOUT); } /* * i8042_flush() flushes all data that may be in the keyboard and mouse buffers * of the i8042 down the toilet. */ static int i8042_flush(void) { unsigned long flags; unsigned char data, str; int count = 0; int retval = 0; spin_lock_irqsave(&i8042_lock, flags); while ((str = i8042_read_status()) & I8042_STR_OBF) { if (count++ < I8042_BUFFER_SIZE) { udelay(50); data = i8042_read_data(); dbg("%02x <- i8042 (flush, %s)\n", data, str & I8042_STR_AUXDATA ? "aux" : "kbd"); } else { retval = -EIO; break; } } spin_unlock_irqrestore(&i8042_lock, flags); return retval; } /* * i8042_command() executes a command on the i8042. It also sends the input * parameter(s) of the commands to it, and receives the output value(s). The * parameters are to be stored in the param array, and the output is placed * into the same array. The number of the parameters and output values is * encoded in bits 8-11 of the command number. */ static int __i8042_command(unsigned char *param, int command) { int i, error; if (i8042_noloop && command == I8042_CMD_AUX_LOOP) return -1; error = i8042_wait_write(); if (error) return error; dbg("%02x -> i8042 (command)\n", command & 0xff); i8042_write_command(command & 0xff); for (i = 0; i < ((command >> 12) & 0xf); i++) { error = i8042_wait_write(); if (error) { dbg(" -- i8042 (wait write timeout)\n"); return error; } dbg("%02x -> i8042 (parameter)\n", param[i]); i8042_write_data(param[i]); } for (i = 0; i < ((command >> 8) & 0xf); i++) { error = i8042_wait_read(); if (error) { dbg(" -- i8042 (wait read timeout)\n"); return error; } if (command == I8042_CMD_AUX_LOOP && !(i8042_read_status() & I8042_STR_AUXDATA)) { dbg(" -- i8042 (auxerr)\n"); return -1; } param[i] = i8042_read_data(); dbg("%02x <- i8042 (return)\n", param[i]); } return 0; } int i8042_command(unsigned char *param, int command) { unsigned long flags; int retval; if (!i8042_present) return -1; spin_lock_irqsave(&i8042_lock, flags); retval = __i8042_command(param, command); spin_unlock_irqrestore(&i8042_lock, flags); return retval; } EXPORT_SYMBOL(i8042_command); /* * i8042_kbd_write() sends a byte out through the keyboard interface. */ static int i8042_kbd_write(struct serio *port, unsigned char c) { unsigned long flags; int retval = 0; spin_lock_irqsave(&i8042_lock, flags); if (!(retval = i8042_wait_write())) { dbg("%02x -> i8042 (kbd-data)\n", c); i8042_write_data(c); } spin_unlock_irqrestore(&i8042_lock, flags); return retval; } /* * i8042_aux_write() sends a byte out through the aux interface. */ static int i8042_aux_write(struct serio *serio, unsigned char c) { struct i8042_port *port = serio->port_data; return i8042_command(&c, port->mux == -1 ? I8042_CMD_AUX_SEND : I8042_CMD_MUX_SEND + port->mux); } /* * i8042_port_close attempts to clear AUX or KBD port state by disabling * and then re-enabling it. */ static void i8042_port_close(struct serio *serio) { int irq_bit; int disable_bit; const char *port_name; if (serio == i8042_ports[I8042_AUX_PORT_NO].serio) { irq_bit = I8042_CTR_AUXINT; disable_bit = I8042_CTR_AUXDIS; port_name = "AUX"; } else { irq_bit = I8042_CTR_KBDINT; disable_bit = I8042_CTR_KBDDIS; port_name = "KBD"; } i8042_ctr &= ~irq_bit; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) pr_warn("Can't write CTR while closing %s port\n", port_name); udelay(50); i8042_ctr &= ~disable_bit; i8042_ctr |= irq_bit; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) pr_err("Can't reactivate %s port\n", port_name); /* * See if there is any data appeared while we were messing with * port state. */ i8042_interrupt(0, NULL); } /* * i8042_start() is called by serio core when port is about to finish * registering. It will mark port as existing so i8042_interrupt can * start sending data through it. */ static int i8042_start(struct serio *serio) { struct i8042_port *port = serio->port_data; device_set_wakeup_capable(&serio->dev, true); /* * On platforms using suspend-to-idle, allow the keyboard to * wake up the system from sleep by enabling keyboard wakeups * by default. This is consistent with keyboard wakeup * behavior on many platforms using suspend-to-RAM (ACPI S3) * by default. */ if (pm_suspend_default_s2idle() && serio == i8042_ports[I8042_KBD_PORT_NO].serio) { device_set_wakeup_enable(&serio->dev, true); } spin_lock_irq(&i8042_lock); port->exists = true; spin_unlock_irq(&i8042_lock); return 0; } /* * i8042_stop() marks serio port as non-existing so i8042_interrupt * will not try to send data to the port that is about to go away. * The function is called by serio core as part of unregister procedure. */ static void i8042_stop(struct serio *serio) { struct i8042_port *port = serio->port_data; spin_lock_irq(&i8042_lock); port->exists = false; port->serio = NULL; spin_unlock_irq(&i8042_lock); /* * We need to make sure that interrupt handler finishes using * our serio port before we return from this function. * We synchronize with both AUX and KBD IRQs because there is * a (very unlikely) chance that AUX IRQ is raised for KBD port * and vice versa. */ synchronize_irq(I8042_AUX_IRQ); synchronize_irq(I8042_KBD_IRQ); } /* * i8042_filter() filters out unwanted bytes from the input data stream. * It is called from i8042_interrupt and thus is running with interrupts * off and i8042_lock held. */ static bool i8042_filter(unsigned char data, unsigned char str, struct serio *serio) { if (unlikely(i8042_suppress_kbd_ack)) { if ((~str & I8042_STR_AUXDATA) && (data == 0xfa || data == 0xfe)) { i8042_suppress_kbd_ack--; dbg("Extra keyboard ACK - filtered out\n"); return true; } } if (i8042_platform_filter && i8042_platform_filter(data, str, serio)) { dbg("Filtered out by platform filter\n"); return true; } return false; } /* * i8042_interrupt() is the most important function in this driver - * it handles the interrupts from the i8042, and sends incoming bytes * to the upper layers. */ static irqreturn_t i8042_interrupt(int irq, void *dev_id) { struct i8042_port *port; struct serio *serio; unsigned long flags; unsigned char str, data; unsigned int dfl; unsigned int port_no; bool filtered; int ret = 1; spin_lock_irqsave(&i8042_lock, flags); str = i8042_read_status(); if (unlikely(~str & I8042_STR_OBF)) { spin_unlock_irqrestore(&i8042_lock, flags); if (irq) dbg("Interrupt %d, without any data\n", irq); ret = 0; goto out; } data = i8042_read_data(); if (i8042_mux_present && (str & I8042_STR_AUXDATA)) { static unsigned long last_transmit; static unsigned char last_str; dfl = 0; if (str & I8042_STR_MUXERR) { dbg("MUX error, status is %02x, data is %02x\n", str, data); /* * When MUXERR condition is signalled the data register can only contain * 0xfd, 0xfe or 0xff if implementation follows the spec. Unfortunately * it is not always the case. Some KBCs also report 0xfc when there is * nothing connected to the port while others sometimes get confused which * port the data came from and signal error leaving the data intact. They * _do not_ revert to legacy mode (actually I've never seen KBC reverting * to legacy mode yet, when we see one we'll add proper handling). * Anyway, we process 0xfc, 0xfd, 0xfe and 0xff as timeouts, and for the * rest assume that the data came from the same serio last byte * was transmitted (if transmission happened not too long ago). */ switch (data) { default: if (time_before(jiffies, last_transmit + HZ/10)) { str = last_str; break; } fallthrough; /* report timeout */ case 0xfc: case 0xfd: case 0xfe: dfl = SERIO_TIMEOUT; data = 0xfe; break; case 0xff: dfl = SERIO_PARITY; data = 0xfe; break; } } port_no = I8042_MUX_PORT_NO + ((str >> 6) & 3); last_str = str; last_transmit = jiffies; } else { dfl = ((str & I8042_STR_PARITY) ? SERIO_PARITY : 0) | ((str & I8042_STR_TIMEOUT && !i8042_notimeout) ? SERIO_TIMEOUT : 0); port_no = (str & I8042_STR_AUXDATA) ? I8042_AUX_PORT_NO : I8042_KBD_PORT_NO; } port = &i8042_ports[port_no]; serio = port->exists ? port->serio : NULL; filter_dbg(port->driver_bound, data, "<- i8042 (interrupt, %d, %d%s%s)\n", port_no, irq, dfl & SERIO_PARITY ? ", bad parity" : "", dfl & SERIO_TIMEOUT ? ", timeout" : ""); filtered = i8042_filter(data, str, serio); spin_unlock_irqrestore(&i8042_lock, flags); if (likely(serio && !filtered)) serio_interrupt(serio, data, dfl); out: return IRQ_RETVAL(ret); } /* * i8042_enable_kbd_port enables keyboard port on chip */ static int i8042_enable_kbd_port(void) { i8042_ctr &= ~I8042_CTR_KBDDIS; i8042_ctr |= I8042_CTR_KBDINT; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { i8042_ctr &= ~I8042_CTR_KBDINT; i8042_ctr |= I8042_CTR_KBDDIS; pr_err("Failed to enable KBD port\n"); return -EIO; } return 0; } /* * i8042_enable_aux_port enables AUX (mouse) port on chip */ static int i8042_enable_aux_port(void) { i8042_ctr &= ~I8042_CTR_AUXDIS; i8042_ctr |= I8042_CTR_AUXINT; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { i8042_ctr &= ~I8042_CTR_AUXINT; i8042_ctr |= I8042_CTR_AUXDIS; pr_err("Failed to enable AUX port\n"); return -EIO; } return 0; } /* * i8042_enable_mux_ports enables 4 individual AUX ports after * the controller has been switched into Multiplexed mode */ static int i8042_enable_mux_ports(void) { unsigned char param; int i; for (i = 0; i < I8042_NUM_MUX_PORTS; i++) { i8042_command(¶m, I8042_CMD_MUX_PFX + i); i8042_command(¶m, I8042_CMD_AUX_ENABLE); } return i8042_enable_aux_port(); } /* * i8042_set_mux_mode checks whether the controller has an * active multiplexor and puts the chip into Multiplexed (true) * or Legacy (false) mode. */ static int i8042_set_mux_mode(bool multiplex, unsigned char *mux_version) { unsigned char param, val; /* * Get rid of bytes in the queue. */ i8042_flush(); /* * Internal loopback test - send three bytes, they should come back from the * mouse interface, the last should be version. */ param = val = 0xf0; if (i8042_command(¶m, I8042_CMD_AUX_LOOP) || param != val) return -1; param = val = multiplex ? 0x56 : 0xf6; if (i8042_command(¶m, I8042_CMD_AUX_LOOP) || param != val) return -1; param = val = multiplex ? 0xa4 : 0xa5; if (i8042_command(¶m, I8042_CMD_AUX_LOOP) || param == val) return -1; /* * Workaround for interference with USB Legacy emulation * that causes a v10.12 MUX to be found. */ if (param == 0xac) return -1; if (mux_version) *mux_version = param; return 0; } /* * i8042_check_mux() checks whether the controller supports the PS/2 Active * Multiplexing specification by Synaptics, Phoenix, Insyde and * LCS/Telegraphics. */ static int i8042_check_mux(void) { unsigned char mux_version; if (i8042_set_mux_mode(true, &mux_version)) return -1; pr_info("Detected active multiplexing controller, rev %d.%d\n", (mux_version >> 4) & 0xf, mux_version & 0xf); /* * Disable all muxed ports by disabling AUX. */ i8042_ctr |= I8042_CTR_AUXDIS; i8042_ctr &= ~I8042_CTR_AUXINT; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { pr_err("Failed to disable AUX port, can't use MUX\n"); return -EIO; } i8042_mux_present = true; return 0; } /* * The following is used to test AUX IRQ delivery. */ static struct completion i8042_aux_irq_delivered; static bool i8042_irq_being_tested; static irqreturn_t i8042_aux_test_irq(int irq, void *dev_id) { unsigned long flags; unsigned char str, data; int ret = 0; spin_lock_irqsave(&i8042_lock, flags); str = i8042_read_status(); if (str & I8042_STR_OBF) { data = i8042_read_data(); dbg("%02x <- i8042 (aux_test_irq, %s)\n", data, str & I8042_STR_AUXDATA ? "aux" : "kbd"); if (i8042_irq_being_tested && data == 0xa5 && (str & I8042_STR_AUXDATA)) complete(&i8042_aux_irq_delivered); ret = 1; } spin_unlock_irqrestore(&i8042_lock, flags); return IRQ_RETVAL(ret); } /* * i8042_toggle_aux - enables or disables AUX port on i8042 via command and * verifies success by readinng CTR. Used when testing for presence of AUX * port. */ static int i8042_toggle_aux(bool on) { unsigned char param; int i; if (i8042_command(¶m, on ? I8042_CMD_AUX_ENABLE : I8042_CMD_AUX_DISABLE)) return -1; /* some chips need some time to set the I8042_CTR_AUXDIS bit */ for (i = 0; i < 100; i++) { udelay(50); if (i8042_command(¶m, I8042_CMD_CTL_RCTR)) return -1; if (!(param & I8042_CTR_AUXDIS) == on) return 0; } return -1; } /* * i8042_check_aux() applies as much paranoia as it can at detecting * the presence of an AUX interface. */ static int i8042_check_aux(void) { int retval = -1; bool irq_registered = false; bool aux_loop_broken = false; unsigned long flags; unsigned char param; /* * Get rid of bytes in the queue. */ i8042_flush(); /* * Internal loopback test - filters out AT-type i8042's. Unfortunately * SiS screwed up and their 5597 doesn't support the LOOP command even * though it has an AUX port. */ param = 0x5a; retval = i8042_command(¶m, I8042_CMD_AUX_LOOP); if (retval || param != 0x5a) { /* * External connection test - filters out AT-soldered PS/2 i8042's * 0x00 - no error, 0x01-0x03 - clock/data stuck, 0xff - general error * 0xfa - no error on some notebooks which ignore the spec * Because it's common for chipsets to return error on perfectly functioning * AUX ports, we test for this only when the LOOP command failed. */ if (i8042_command(¶m, I8042_CMD_AUX_TEST) || (param && param != 0xfa && param != 0xff)) return -1; /* * If AUX_LOOP completed without error but returned unexpected data * mark it as broken */ if (!retval) aux_loop_broken = true; } /* * Bit assignment test - filters out PS/2 i8042's in AT mode */ if (i8042_toggle_aux(false)) { pr_warn("Failed to disable AUX port, but continuing anyway... Is this a SiS?\n"); pr_warn("If AUX port is really absent please use the 'i8042.noaux' option\n"); } if (i8042_toggle_aux(true)) return -1; /* * Reset keyboard (needed on some laptops to successfully detect * touchpad, e.g., some Gigabyte laptop models with Elantech * touchpads). */ if (i8042_kbdreset) { pr_warn("Attempting to reset device connected to KBD port\n"); i8042_kbd_write(NULL, (unsigned char) 0xff); } /* * Test AUX IRQ delivery to make sure BIOS did not grab the IRQ and * used it for a PCI card or somethig else. */ if (i8042_noloop || i8042_bypass_aux_irq_test || aux_loop_broken) { /* * Without LOOP command we can't test AUX IRQ delivery. Assume the port * is working and hope we are right. */ retval = 0; goto out; } if (request_irq(I8042_AUX_IRQ, i8042_aux_test_irq, IRQF_SHARED, "i8042", i8042_platform_device)) goto out; irq_registered = true; if (i8042_enable_aux_port()) goto out; spin_lock_irqsave(&i8042_lock, flags); init_completion(&i8042_aux_irq_delivered); i8042_irq_being_tested = true; param = 0xa5; retval = __i8042_command(¶m, I8042_CMD_AUX_LOOP & 0xf0ff); spin_unlock_irqrestore(&i8042_lock, flags); if (retval) goto out; if (wait_for_completion_timeout(&i8042_aux_irq_delivered, msecs_to_jiffies(250)) == 0) { /* * AUX IRQ was never delivered so we need to flush the controller to * get rid of the byte we put there; otherwise keyboard may not work. */ dbg(" -- i8042 (aux irq test timeout)\n"); i8042_flush(); retval = -1; } out: /* * Disable the interface. */ i8042_ctr |= I8042_CTR_AUXDIS; i8042_ctr &= ~I8042_CTR_AUXINT; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) retval = -1; if (irq_registered) free_irq(I8042_AUX_IRQ, i8042_platform_device); return retval; } static int i8042_controller_check(void) { if (i8042_flush()) { pr_info("No controller found\n"); return -ENODEV; } return 0; } static int i8042_controller_selftest(void) { unsigned char param; int i = 0; /* * We try this 5 times; on some really fragile systems this does not * take the first time... */ do { if (i8042_command(¶m, I8042_CMD_CTL_TEST)) { pr_err("i8042 controller selftest timeout\n"); return -ENODEV; } if (param == I8042_RET_CTL_TEST) return 0; dbg("i8042 controller selftest: %#x != %#x\n", param, I8042_RET_CTL_TEST); msleep(50); } while (i++ < 5); #ifdef CONFIG_X86 /* * On x86, we don't fail entire i8042 initialization if controller * reset fails in hopes that keyboard port will still be functional * and user will still get a working keyboard. This is especially * important on netbooks. On other arches we trust hardware more. */ pr_info("giving up on controller selftest, continuing anyway...\n"); return 0; #else pr_err("i8042 controller selftest failed\n"); return -EIO; #endif } /* * i8042_controller_init initializes the i8042 controller, and, * most importantly, sets it into non-xlated mode if that's * desired. */ static int i8042_controller_init(void) { unsigned long flags; int n = 0; unsigned char ctr[2]; /* * Save the CTR for restore on unload / reboot. */ do { if (n >= 10) { pr_err("Unable to get stable CTR read\n"); return -EIO; } if (n != 0) udelay(50); if (i8042_command(&ctr[n++ % 2], I8042_CMD_CTL_RCTR)) { pr_err("Can't read CTR while initializing i8042\n"); return i8042_probe_defer ? -EPROBE_DEFER : -EIO; } } while (n < 2 || ctr[0] != ctr[1]); i8042_initial_ctr = i8042_ctr = ctr[0]; /* * Disable the keyboard interface and interrupt. */ i8042_ctr |= I8042_CTR_KBDDIS; i8042_ctr &= ~I8042_CTR_KBDINT; /* * Handle keylock. */ spin_lock_irqsave(&i8042_lock, flags); if (~i8042_read_status() & I8042_STR_KEYLOCK) { if (i8042_unlock) i8042_ctr |= I8042_CTR_IGNKEYLOCK; else pr_warn("Warning: Keylock active\n"); } spin_unlock_irqrestore(&i8042_lock, flags); /* * If the chip is configured into nontranslated mode by the BIOS, don't * bother enabling translating and be happy. */ if (~i8042_ctr & I8042_CTR_XLATE) i8042_direct = true; /* * Set nontranslated mode for the kbd interface if requested by an option. * After this the kbd interface becomes a simple serial in/out, like the aux * interface is. We don't do this by default, since it can confuse notebook * BIOSes. */ if (i8042_direct) i8042_ctr &= ~I8042_CTR_XLATE; /* * Write CTR back. */ if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { pr_err("Can't write CTR while initializing i8042\n"); return -EIO; } /* * Flush whatever accumulated while we were disabling keyboard port. */ i8042_flush(); return 0; } /* * Reset the controller and reset CRT to the original value set by BIOS. */ static void i8042_controller_reset(bool s2r_wants_reset) { i8042_flush(); /* * Disable both KBD and AUX interfaces so they don't get in the way */ i8042_ctr |= I8042_CTR_KBDDIS | I8042_CTR_AUXDIS; i8042_ctr &= ~(I8042_CTR_KBDINT | I8042_CTR_AUXINT); if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) pr_warn("Can't write CTR while resetting\n"); /* * Disable MUX mode if present. */ if (i8042_mux_present) i8042_set_mux_mode(false, NULL); /* * Reset the controller if requested. */ if (i8042_reset == I8042_RESET_ALWAYS || (i8042_reset == I8042_RESET_ON_S2RAM && s2r_wants_reset)) { i8042_controller_selftest(); } /* * Restore the original control register setting. */ if (i8042_command(&i8042_initial_ctr, I8042_CMD_CTL_WCTR)) pr_warn("Can't restore CTR\n"); } /* * i8042_panic_blink() will turn the keyboard LEDs on or off and is called * when kernel panics. Flashing LEDs is useful for users running X who may * not see the console and will help distinguishing panics from "real" * lockups. * * Note that DELAY has a limit of 10ms so we will not get stuck here * waiting for KBC to free up even if KBD interrupt is off */ #define DELAY do { mdelay(1); if (++delay > 10) return delay; } while(0) static long i8042_panic_blink(int state) { long delay = 0; char led; led = (state) ? 0x01 | 0x04 : 0; while (i8042_read_status() & I8042_STR_IBF) DELAY; dbg("%02x -> i8042 (panic blink)\n", 0xed); i8042_suppress_kbd_ack = 2; i8042_write_data(0xed); /* set leds */ DELAY; while (i8042_read_status() & I8042_STR_IBF) DELAY; DELAY; dbg("%02x -> i8042 (panic blink)\n", led); i8042_write_data(led); DELAY; return delay; } #undef DELAY #ifdef CONFIG_X86 static void i8042_dritek_enable(void) { unsigned char param = 0x90; int error; error = i8042_command(¶m, 0x1059); if (error) pr_warn("Failed to enable DRITEK extension: %d\n", error); } #endif #ifdef CONFIG_PM /* * Here we try to reset everything back to a state we had * before suspending. */ static int i8042_controller_resume(bool s2r_wants_reset) { int error; error = i8042_controller_check(); if (error) return error; if (i8042_reset == I8042_RESET_ALWAYS || (i8042_reset == I8042_RESET_ON_S2RAM && s2r_wants_reset)) { error = i8042_controller_selftest(); if (error) return error; } /* * Restore original CTR value and disable all ports */ i8042_ctr = i8042_initial_ctr; if (i8042_direct) i8042_ctr &= ~I8042_CTR_XLATE; i8042_ctr |= I8042_CTR_AUXDIS | I8042_CTR_KBDDIS; i8042_ctr &= ~(I8042_CTR_AUXINT | I8042_CTR_KBDINT); if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { pr_warn("Can't write CTR to resume, retrying...\n"); msleep(50); if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { pr_err("CTR write retry failed\n"); return -EIO; } } #ifdef CONFIG_X86 if (i8042_dritek) i8042_dritek_enable(); #endif if (i8042_mux_present) { if (i8042_set_mux_mode(true, NULL) || i8042_enable_mux_ports()) pr_warn("failed to resume active multiplexor, mouse won't work\n"); } else if (i8042_ports[I8042_AUX_PORT_NO].serio) i8042_enable_aux_port(); if (i8042_ports[I8042_KBD_PORT_NO].serio) i8042_enable_kbd_port(); i8042_interrupt(0, NULL); return 0; } /* * Here we try to restore the original BIOS settings to avoid * upsetting it. */ static int i8042_pm_suspend(struct device *dev) { int i; if (pm_suspend_via_firmware()) i8042_controller_reset(true); /* Set up serio interrupts for system wakeup. */ for (i = 0; i < I8042_NUM_PORTS; i++) { struct serio *serio = i8042_ports[i].serio; if (serio && device_may_wakeup(&serio->dev)) enable_irq_wake(i8042_ports[i].irq); } return 0; } static int i8042_pm_resume_noirq(struct device *dev) { if (!pm_resume_via_firmware()) i8042_interrupt(0, NULL); return 0; } static int i8042_pm_resume(struct device *dev) { bool want_reset; int i; for (i = 0; i < I8042_NUM_PORTS; i++) { struct serio *serio = i8042_ports[i].serio; if (serio && device_may_wakeup(&serio->dev)) disable_irq_wake(i8042_ports[i].irq); } /* * If platform firmware was not going to be involved in suspend, we did * not restore the controller state to whatever it had been at boot * time, so we do not need to do anything. */ if (!pm_suspend_via_firmware()) return 0; /* * We only need to reset the controller if we are resuming after handing * off control to the platform firmware, otherwise we can simply restore * the mode. */ want_reset = pm_resume_via_firmware(); return i8042_controller_resume(want_reset); } static int i8042_pm_thaw(struct device *dev) { i8042_interrupt(0, NULL); return 0; } static int i8042_pm_reset(struct device *dev) { i8042_controller_reset(false); return 0; } static int i8042_pm_restore(struct device *dev) { return i8042_controller_resume(false); } static const struct dev_pm_ops i8042_pm_ops = { .suspend = i8042_pm_suspend, .resume_noirq = i8042_pm_resume_noirq, .resume = i8042_pm_resume, .thaw = i8042_pm_thaw, .poweroff = i8042_pm_reset, .restore = i8042_pm_restore, }; #endif /* CONFIG_PM */ /* * We need to reset the 8042 back to original mode on system shutdown, * because otherwise BIOSes will be confused. */ static void i8042_shutdown(struct platform_device *dev) { i8042_controller_reset(false); } static int i8042_create_kbd_port(void) { struct serio *serio; struct i8042_port *port = &i8042_ports[I8042_KBD_PORT_NO]; serio = kzalloc(sizeof(struct serio), GFP_KERNEL); if (!serio) return -ENOMEM; serio->id.type = i8042_direct ? SERIO_8042 : SERIO_8042_XL; serio->write = i8042_dumbkbd ? NULL : i8042_kbd_write; serio->start = i8042_start; serio->stop = i8042_stop; serio->close = i8042_port_close; serio->ps2_cmd_mutex = &i8042_mutex; serio->port_data = port; serio->dev.parent = &i8042_platform_device->dev; strscpy(serio->name, "i8042 KBD port", sizeof(serio->name)); strscpy(serio->phys, I8042_KBD_PHYS_DESC, sizeof(serio->phys)); strscpy(serio->firmware_id, i8042_kbd_firmware_id, sizeof(serio->firmware_id)); set_primary_fwnode(&serio->dev, i8042_kbd_fwnode); port->serio = serio; port->irq = I8042_KBD_IRQ; return 0; } static int i8042_create_aux_port(int idx) { struct serio *serio; int port_no = idx < 0 ? I8042_AUX_PORT_NO : I8042_MUX_PORT_NO + idx; struct i8042_port *port = &i8042_ports[port_no]; serio = kzalloc(sizeof(struct serio), GFP_KERNEL); if (!serio) return -ENOMEM; serio->id.type = SERIO_8042; serio->write = i8042_aux_write; serio->start = i8042_start; serio->stop = i8042_stop; serio->ps2_cmd_mutex = &i8042_mutex; serio->port_data = port; serio->dev.parent = &i8042_platform_device->dev; if (idx < 0) { strscpy(serio->name, "i8042 AUX port", sizeof(serio->name)); strscpy(serio->phys, I8042_AUX_PHYS_DESC, sizeof(serio->phys)); strscpy(serio->firmware_id, i8042_aux_firmware_id, sizeof(serio->firmware_id)); serio->close = i8042_port_close; } else { snprintf(serio->name, sizeof(serio->name), "i8042 AUX%d port", idx); snprintf(serio->phys, sizeof(serio->phys), I8042_MUX_PHYS_DESC, idx + 1); strscpy(serio->firmware_id, i8042_aux_firmware_id, sizeof(serio->firmware_id)); } port->serio = serio; port->mux = idx; port->irq = I8042_AUX_IRQ; return 0; } static void i8042_free_kbd_port(void) { kfree(i8042_ports[I8042_KBD_PORT_NO].serio); i8042_ports[I8042_KBD_PORT_NO].serio = NULL; } static void i8042_free_aux_ports(void) { int i; for (i = I8042_AUX_PORT_NO; i < I8042_NUM_PORTS; i++) { kfree(i8042_ports[i].serio); i8042_ports[i].serio = NULL; } } static void i8042_register_ports(void) { int i; for (i = 0; i < I8042_NUM_PORTS; i++) { struct serio *serio = i8042_ports[i].serio; if (!serio) continue; printk(KERN_INFO "serio: %s at %#lx,%#lx irq %d\n", serio->name, (unsigned long) I8042_DATA_REG, (unsigned long) I8042_COMMAND_REG, i8042_ports[i].irq); serio_register_port(serio); } } static void i8042_unregister_ports(void) { int i; for (i = 0; i < I8042_NUM_PORTS; i++) { if (i8042_ports[i].serio) { serio_unregister_port(i8042_ports[i].serio); i8042_ports[i].serio = NULL; } } } static void i8042_free_irqs(void) { if (i8042_aux_irq_registered) free_irq(I8042_AUX_IRQ, i8042_platform_device); if (i8042_kbd_irq_registered) free_irq(I8042_KBD_IRQ, i8042_platform_device); i8042_aux_irq_registered = i8042_kbd_irq_registered = false; } static int i8042_setup_aux(void) { int (*aux_enable)(void); int error; int i; if (i8042_check_aux()) return -ENODEV; if (i8042_nomux || i8042_check_mux()) { error = i8042_create_aux_port(-1); if (error) goto err_free_ports; aux_enable = i8042_enable_aux_port; } else { for (i = 0; i < I8042_NUM_MUX_PORTS; i++) { error = i8042_create_aux_port(i); if (error) goto err_free_ports; } aux_enable = i8042_enable_mux_ports; } error = request_irq(I8042_AUX_IRQ, i8042_interrupt, IRQF_SHARED, "i8042", i8042_platform_device); if (error) goto err_free_ports; error = aux_enable(); if (error) goto err_free_irq; i8042_aux_irq_registered = true; return 0; err_free_irq: free_irq(I8042_AUX_IRQ, i8042_platform_device); err_free_ports: i8042_free_aux_ports(); return error; } static int i8042_setup_kbd(void) { int error; error = i8042_create_kbd_port(); if (error) return error; error = request_irq(I8042_KBD_IRQ, i8042_interrupt, IRQF_SHARED, "i8042", i8042_platform_device); if (error) goto err_free_port; error = i8042_enable_kbd_port(); if (error) goto err_free_irq; i8042_kbd_irq_registered = true; return 0; err_free_irq: free_irq(I8042_KBD_IRQ, i8042_platform_device); err_free_port: i8042_free_kbd_port(); return error; } static int i8042_kbd_bind_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; struct serio *serio = to_serio_port(dev); struct i8042_port *port = serio->port_data; if (serio != i8042_ports[I8042_KBD_PORT_NO].serio) return 0; switch (action) { case BUS_NOTIFY_BOUND_DRIVER: port->driver_bound = true; break; case BUS_NOTIFY_UNBIND_DRIVER: port->driver_bound = false; break; } return 0; } static int i8042_probe(struct platform_device *dev) { int error; if (i8042_reset == I8042_RESET_ALWAYS) { error = i8042_controller_selftest(); if (error) return error; } error = i8042_controller_init(); if (error) return error; #ifdef CONFIG_X86 if (i8042_dritek) i8042_dritek_enable(); #endif if (!i8042_noaux) { error = i8042_setup_aux(); if (error && error != -ENODEV && error != -EBUSY) goto out_fail; } if (!i8042_nokbd) { error = i8042_setup_kbd(); if (error) goto out_fail; } /* * Ok, everything is ready, let's register all serio ports */ i8042_register_ports(); return 0; out_fail: i8042_free_aux_ports(); /* in case KBD failed but AUX not */ i8042_free_irqs(); i8042_controller_reset(false); return error; } static void i8042_remove(struct platform_device *dev) { i8042_unregister_ports(); i8042_free_irqs(); i8042_controller_reset(false); } static struct platform_driver i8042_driver = { .driver = { .name = "i8042", #ifdef CONFIG_PM .pm = &i8042_pm_ops, #endif }, .probe = i8042_probe, .remove_new = i8042_remove, .shutdown = i8042_shutdown, }; static struct notifier_block i8042_kbd_bind_notifier_block = { .notifier_call = i8042_kbd_bind_notifier, }; static int __init i8042_init(void) { int err; dbg_init(); err = i8042_platform_init(); if (err) return (err == -ENODEV) ? 0 : err; err = i8042_controller_check(); if (err) goto err_platform_exit; /* Set this before creating the dev to allow i8042_command to work right away */ i8042_present = true; err = platform_driver_register(&i8042_driver); if (err) goto err_platform_exit; i8042_platform_device = platform_device_alloc("i8042", -1); if (!i8042_platform_device) { err = -ENOMEM; goto err_unregister_driver; } err = platform_device_add(i8042_platform_device); if (err) goto err_free_device; bus_register_notifier(&serio_bus, &i8042_kbd_bind_notifier_block); panic_blink = i8042_panic_blink; return 0; err_free_device: platform_device_put(i8042_platform_device); err_unregister_driver: platform_driver_unregister(&i8042_driver); err_platform_exit: i8042_platform_exit(); return err; } static void __exit i8042_exit(void) { if (!i8042_present) return; platform_device_unregister(i8042_platform_device); platform_driver_unregister(&i8042_driver); i8042_platform_exit(); bus_unregister_notifier(&serio_bus, &i8042_kbd_bind_notifier_block); panic_blink = NULL; } module_init(i8042_init); module_exit(i8042_exit); |
11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _TCP_AO_H #define _TCP_AO_H #define TCP_AO_KEY_ALIGN 1 #define __tcp_ao_key_align __aligned(TCP_AO_KEY_ALIGN) union tcp_ao_addr { struct in_addr a4; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr a6; #endif }; struct tcp_ao_hdr { u8 kind; u8 length; u8 keyid; u8 rnext_keyid; }; struct tcp_ao_counters { atomic64_t pkt_good; atomic64_t pkt_bad; atomic64_t key_not_found; atomic64_t ao_required; atomic64_t dropped_icmp; }; struct tcp_ao_key { struct hlist_node node; union tcp_ao_addr addr; u8 key[TCP_AO_MAXKEYLEN] __tcp_ao_key_align; unsigned int tcp_sigpool_id; unsigned int digest_size; int l3index; u8 prefixlen; u8 family; u8 keylen; u8 keyflags; u8 sndid; u8 rcvid; u8 maclen; struct rcu_head rcu; atomic64_t pkt_good; atomic64_t pkt_bad; u8 traffic_keys[]; }; static inline u8 *rcv_other_key(struct tcp_ao_key *key) { return key->traffic_keys; } static inline u8 *snd_other_key(struct tcp_ao_key *key) { return key->traffic_keys + key->digest_size; } static inline int tcp_ao_maclen(const struct tcp_ao_key *key) { return key->maclen; } /* Use tcp_ao_len_aligned() for TCP header calculations */ static inline int tcp_ao_len(const struct tcp_ao_key *key) { return tcp_ao_maclen(key) + sizeof(struct tcp_ao_hdr); } static inline int tcp_ao_len_aligned(const struct tcp_ao_key *key) { return round_up(tcp_ao_len(key), 4); } static inline unsigned int tcp_ao_digest_size(struct tcp_ao_key *key) { return key->digest_size; } static inline int tcp_ao_sizeof_key(const struct tcp_ao_key *key) { return sizeof(struct tcp_ao_key) + (key->digest_size << 1); } struct tcp_ao_info { /* List of tcp_ao_key's */ struct hlist_head head; /* current_key and rnext_key aren't maintained on listen sockets. * Their purpose is to cache keys on established connections, * saving needless lookups. Never dereference any of them from * listen sockets. * ::current_key may change in RX to the key that was requested by * the peer, please use READ_ONCE()/WRITE_ONCE() in order to avoid * load/store tearing. * Do the same for ::rnext_key, if you don't hold socket lock * (it's changed only by userspace request in setsockopt()). */ struct tcp_ao_key *current_key; struct tcp_ao_key *rnext_key; struct tcp_ao_counters counters; u32 ao_required :1, accept_icmps :1, __unused :30; __be32 lisn; __be32 risn; /* Sequence Number Extension (SNE) are upper 4 bytes for SEQ, * that protect TCP-AO connection from replayed old TCP segments. * See RFC5925 (6.2). * In order to get correct SNE, there's a helper tcp_ao_compute_sne(). * It needs SEQ basis to understand whereabouts are lower SEQ numbers. * According to that basis vector, it can provide incremented SNE * when SEQ rolls over or provide decremented SNE when there's * a retransmitted segment from before-rolling over. * - for request sockets such basis is rcv_isn/snt_isn, which seems * good enough as it's unexpected to receive 4 Gbytes on reqsk. * - for full sockets the basis is rcv_nxt/snd_una. snd_una is * taken instead of snd_nxt as currently it's easier to track * in tcp_snd_una_update(), rather than updating SNE in all * WRITE_ONCE(tp->snd_nxt, ...) * - for time-wait sockets the basis is tw_rcv_nxt/tw_snd_nxt. * tw_snd_nxt is not expected to change, while tw_rcv_nxt may. */ u32 snd_sne; u32 rcv_sne; refcount_t refcnt; /* Protects twsk destruction */ struct rcu_head rcu; }; #ifdef CONFIG_TCP_MD5SIG #include <linux/jump_label.h> extern struct static_key_false_deferred tcp_md5_needed; #define static_branch_tcp_md5() static_branch_unlikely(&tcp_md5_needed.key) #else #define static_branch_tcp_md5() false #endif #ifdef CONFIG_TCP_AO /* TCP-AO structures and functions */ #include <linux/jump_label.h> extern struct static_key_false_deferred tcp_ao_needed; #define static_branch_tcp_ao() static_branch_unlikely(&tcp_ao_needed.key) #else #define static_branch_tcp_ao() false #endif static inline bool tcp_hash_should_produce_warnings(void) { return static_branch_tcp_md5() || static_branch_tcp_ao(); } #define tcp_hash_fail(msg, family, skb, fmt, ...) \ do { \ const struct tcphdr *th = tcp_hdr(skb); \ char hdr_flags[6]; \ char *f = hdr_flags; \ \ if (!tcp_hash_should_produce_warnings()) \ break; \ if (th->fin) \ *f++ = 'F'; \ if (th->syn) \ *f++ = 'S'; \ if (th->rst) \ *f++ = 'R'; \ if (th->psh) \ *f++ = 'P'; \ if (th->ack) \ *f++ = '.'; \ *f = 0; \ if ((family) == AF_INET) { \ net_info_ratelimited("%s for %pI4.%d->%pI4.%d [%s] " fmt "\n", \ msg, &ip_hdr(skb)->saddr, ntohs(th->source), \ &ip_hdr(skb)->daddr, ntohs(th->dest), \ hdr_flags, ##__VA_ARGS__); \ } else { \ net_info_ratelimited("%s for [%pI6c].%d->[%pI6c].%d [%s]" fmt "\n", \ msg, &ipv6_hdr(skb)->saddr, ntohs(th->source), \ &ipv6_hdr(skb)->daddr, ntohs(th->dest), \ hdr_flags, ##__VA_ARGS__); \ } \ } while (0) #ifdef CONFIG_TCP_AO /* TCP-AO structures and functions */ struct tcp4_ao_context { __be32 saddr; __be32 daddr; __be16 sport; __be16 dport; __be32 sisn; __be32 disn; }; struct tcp6_ao_context { struct in6_addr saddr; struct in6_addr daddr; __be16 sport; __be16 dport; __be32 sisn; __be32 disn; }; struct tcp_sigpool; #define TCP_AO_ESTABLISHED (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | \ TCPF_CLOSE | TCPF_CLOSE_WAIT | \ TCPF_LAST_ACK | TCPF_CLOSING) int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, struct tcp_ao_key *key, struct tcphdr *th, __u8 *hash_location); int tcp_ao_hash_skb(unsigned short int family, char *ao_hash, struct tcp_ao_key *key, const struct sock *sk, const struct sk_buff *skb, const u8 *tkey, int hash_offset, u32 sne); int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family, sockptr_t optval, int optlen); struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao, int sndid, int rcvid); int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk, struct request_sock *req, struct sk_buff *skb, int family); int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx, unsigned int len, struct tcp_sigpool *hp); void tcp_ao_destroy_sock(struct sock *sk, bool twsk); void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp); bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code); int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen); int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen); int tcp_ao_get_repair(struct sock *sk, sockptr_t optval, sockptr_t optlen); int tcp_ao_set_repair(struct sock *sk, sockptr_t optval, unsigned int optlen); enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, unsigned short int family, const struct request_sock *req, int l3index, const struct tcp_ao_hdr *aoh); u32 tcp_ao_compute_sne(u32 next_sne, u32 next_seq, u32 seq); struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk, int l3index, const union tcp_ao_addr *addr, int family, int sndid, int rcvid); int tcp_ao_hash_hdr(unsigned short family, char *ao_hash, struct tcp_ao_key *key, const u8 *tkey, const union tcp_ao_addr *daddr, const union tcp_ao_addr *saddr, const struct tcphdr *th, u32 sne); int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb, const struct tcp_ao_hdr *aoh, int l3index, u32 seq, struct tcp_ao_key **key, char **traffic_key, bool *allocated_traffic_key, u8 *keyid, u32 *sne); /* ipv4 specific functions */ int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen); struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk, int sndid, int rcvid); int tcp_v4_ao_synack_hash(char *ao_hash, struct tcp_ao_key *mkt, struct request_sock *req, const struct sk_buff *skb, int hash_offset, u32 sne); int tcp_v4_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key, const struct sock *sk, __be32 sisn, __be32 disn, bool send); int tcp_v4_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key, struct request_sock *req); struct tcp_ao_key *tcp_v4_ao_lookup_rsk(const struct sock *sk, struct request_sock *req, int sndid, int rcvid); int tcp_v4_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key, const struct sock *sk, const struct sk_buff *skb, const u8 *tkey, int hash_offset, u32 sne); /* ipv6 specific functions */ int tcp_v6_ao_hash_pseudoheader(struct tcp_sigpool *hp, const struct in6_addr *daddr, const struct in6_addr *saddr, int nbytes); int tcp_v6_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key, const struct sk_buff *skb, __be32 sisn, __be32 disn); int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key, const struct sock *sk, __be32 sisn, __be32 disn, bool send); int tcp_v6_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key, struct request_sock *req); struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk, struct sock *addr_sk, int sndid, int rcvid); struct tcp_ao_key *tcp_v6_ao_lookup_rsk(const struct sock *sk, struct request_sock *req, int sndid, int rcvid); int tcp_v6_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key, const struct sock *sk, const struct sk_buff *skb, const u8 *tkey, int hash_offset, u32 sne); int tcp_v6_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen); int tcp_v6_ao_synack_hash(char *ao_hash, struct tcp_ao_key *ao_key, struct request_sock *req, const struct sk_buff *skb, int hash_offset, u32 sne); void tcp_ao_established(struct sock *sk); void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb); void tcp_ao_connect_init(struct sock *sk); void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, struct request_sock *req, unsigned short int family); #else /* CONFIG_TCP_AO */ static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, struct tcp_ao_key *key, struct tcphdr *th, __u8 *hash_location) { return 0; } static inline void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, struct request_sock *req, unsigned short int family) { } static inline bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code) { return false; } static inline enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, unsigned short int family, const struct request_sock *req, int l3index, const struct tcp_ao_hdr *aoh) { return SKB_NOT_DROPPED_YET; } static inline struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk, int l3index, const union tcp_ao_addr *addr, int family, int sndid, int rcvid) { return NULL; } static inline void tcp_ao_destroy_sock(struct sock *sk, bool twsk) { } static inline void tcp_ao_established(struct sock *sk) { } static inline void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb) { } static inline void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp) { } static inline void tcp_ao_connect_init(struct sock *sk) { } static inline int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } static inline int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } static inline int tcp_ao_get_repair(struct sock *sk, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } static inline int tcp_ao_set_repair(struct sock *sk, sockptr_t optval, unsigned int optlen) { return -ENOPROTOOPT; } #endif #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) int tcp_do_parse_auth_options(const struct tcphdr *th, const u8 **md5_hash, const u8 **ao_hash); #else static inline int tcp_do_parse_auth_options(const struct tcphdr *th, const u8 **md5_hash, const u8 **ao_hash) { *md5_hash = NULL; *ao_hash = NULL; return 0; } #endif #endif /* _TCP_AO_H */ |
4 4 3 4 4 4 3 3 4 4 4 3 3 3 3 1 4 4 3 3 1 1 4 4 4 1 3 4 4 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 | // SPDX-License-Identifier: GPL-2.0 /* * Write ahead logging implementation copyright Chris Mason 2000 * * The background commits make this code very interrelated, and * overly complex. I need to rethink things a bit....The major players: * * journal_begin -- call with the number of blocks you expect to log. * If the current transaction is too * old, it will block until the current transaction is * finished, and then start a new one. * Usually, your transaction will get joined in with * previous ones for speed. * * journal_join -- same as journal_begin, but won't block on the current * transaction regardless of age. Don't ever call * this. Ever. There are only two places it should be * called from, and they are both inside this file. * * journal_mark_dirty -- adds blocks into this transaction. clears any flags * that might make them get sent to disk * and then marks them BH_JDirty. Puts the buffer head * into the current transaction hash. * * journal_end -- if the current transaction is batchable, it does nothing * otherwise, it could do an async/synchronous commit, or * a full flush of all log and real blocks in the * transaction. * * flush_old_commits -- if the current transaction is too old, it is ended and * commit blocks are sent to disk. Forces commit blocks * to disk for all backgrounded commits that have been * around too long. * -- Note, if you call this as an immediate flush from * within kupdate, it will ignore the immediate flag */ #include <linux/time.h> #include <linux/semaphore.h> #include <linux/vmalloc.h> #include "reiserfs.h" #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/workqueue.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/uaccess.h> #include <linux/slab.h> /* gets a struct reiserfs_journal_list * from a list head */ #define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ j_list)) /* must be correct to keep the desc and commit structs at 4k */ #define JOURNAL_TRANS_HALF 1018 #define BUFNR 64 /*read ahead */ /* cnode stat bits. Move these into reiserfs_fs.h */ /* this block was freed, and can't be written. */ #define BLOCK_FREED 2 /* this block was freed during this transaction, and can't be written */ #define BLOCK_FREED_HOLDER 3 /* used in flush_journal_list */ #define BLOCK_NEEDS_FLUSH 4 #define BLOCK_DIRTIED 5 /* journal list state bits */ #define LIST_TOUCHED 1 #define LIST_DIRTY 2 #define LIST_COMMIT_PENDING 4 /* someone will commit this list */ /* flags for do_journal_end */ #define FLUSH_ALL 1 /* flush commit and real blocks */ #define COMMIT_NOW 2 /* end and commit this transaction */ #define WAIT 4 /* wait for the log blocks to hit the disk */ static int do_journal_end(struct reiserfs_transaction_handle *, int flags); static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall); static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall); static int can_dirty(struct reiserfs_journal_cnode *cn); static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *sb); static void release_journal_dev(struct reiserfs_journal *journal); static void dirty_one_transaction(struct super_block *s, struct reiserfs_journal_list *jl); static void flush_async_commits(struct work_struct *work); static void queue_log_writer(struct super_block *s); /* values for join in do_journal_begin_r */ enum { JBEGIN_REG = 0, /* regular journal begin */ /* join the running transaction if at all possible */ JBEGIN_JOIN = 1, /* called from cleanup code, ignores aborted flag */ JBEGIN_ABORT = 2, }; static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block *sb, unsigned long nblocks, int join); static void init_journal_hash(struct super_block *sb) { struct reiserfs_journal *journal = SB_JOURNAL(sb); memset(journal->j_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); } /* * clears BH_Dirty and sticks the buffer on the clean list. Called because * I can't allow refile_buffer to make schedule happen after I've freed a * block. Look at remove_from_transaction and journal_mark_freed for * more details. */ static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) { if (bh) { clear_buffer_dirty(bh); clear_buffer_journal_test(bh); } return 0; } static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block *sb) { struct reiserfs_bitmap_node *bn; static int id; bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS); if (!bn) { return NULL; } bn->data = kzalloc(sb->s_blocksize, GFP_NOFS); if (!bn->data) { kfree(bn); return NULL; } bn->id = id++; INIT_LIST_HEAD(&bn->list); return bn; } static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb) { struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_bitmap_node *bn = NULL; struct list_head *entry = journal->j_bitmap_nodes.next; journal->j_used_bitmap_nodes++; repeat: if (entry != &journal->j_bitmap_nodes) { bn = list_entry(entry, struct reiserfs_bitmap_node, list); list_del(entry); memset(bn->data, 0, sb->s_blocksize); journal->j_free_bitmap_nodes--; return bn; } bn = allocate_bitmap_node(sb); if (!bn) { yield(); goto repeat; } return bn; } static inline void free_bitmap_node(struct super_block *sb, struct reiserfs_bitmap_node *bn) { struct reiserfs_journal *journal = SB_JOURNAL(sb); journal->j_used_bitmap_nodes--; if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { kfree(bn->data); kfree(bn); } else { list_add(&bn->list, &journal->j_bitmap_nodes); journal->j_free_bitmap_nodes++; } } static void allocate_bitmap_nodes(struct super_block *sb) { int i; struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_bitmap_node *bn = NULL; for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) { bn = allocate_bitmap_node(sb); if (bn) { list_add(&bn->list, &journal->j_bitmap_nodes); journal->j_free_bitmap_nodes++; } else { /* this is ok, we'll try again when more are needed */ break; } } } static int set_bit_in_list_bitmap(struct super_block *sb, b_blocknr_t block, struct reiserfs_list_bitmap *jb) { unsigned int bmap_nr = block / (sb->s_blocksize << 3); unsigned int bit_nr = block % (sb->s_blocksize << 3); if (!jb->bitmaps[bmap_nr]) { jb->bitmaps[bmap_nr] = get_bitmap_node(sb); } set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data); return 0; } static void cleanup_bitmap_list(struct super_block *sb, struct reiserfs_list_bitmap *jb) { int i; if (jb->bitmaps == NULL) return; for (i = 0; i < reiserfs_bmap_count(sb); i++) { if (jb->bitmaps[i]) { free_bitmap_node(sb, jb->bitmaps[i]); jb->bitmaps[i] = NULL; } } } /* * only call this on FS unmount. */ static int free_list_bitmaps(struct super_block *sb, struct reiserfs_list_bitmap *jb_array) { int i; struct reiserfs_list_bitmap *jb; for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { jb = jb_array + i; jb->journal_list = NULL; cleanup_bitmap_list(sb, jb); vfree(jb->bitmaps); jb->bitmaps = NULL; } return 0; } static int free_bitmap_nodes(struct super_block *sb) { struct reiserfs_journal *journal = SB_JOURNAL(sb); struct list_head *next = journal->j_bitmap_nodes.next; struct reiserfs_bitmap_node *bn; while (next != &journal->j_bitmap_nodes) { bn = list_entry(next, struct reiserfs_bitmap_node, list); list_del(next); kfree(bn->data); kfree(bn); next = journal->j_bitmap_nodes.next; journal->j_free_bitmap_nodes--; } return 0; } /* * get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. * jb_array is the array to be filled in. */ int reiserfs_allocate_list_bitmaps(struct super_block *sb, struct reiserfs_list_bitmap *jb_array, unsigned int bmap_nr) { int i; int failed = 0; struct reiserfs_list_bitmap *jb; int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *); for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { jb = jb_array + i; jb->journal_list = NULL; jb->bitmaps = vzalloc(mem); if (!jb->bitmaps) { reiserfs_warning(sb, "clm-2000", "unable to " "allocate bitmaps for journal lists"); failed = 1; break; } } if (failed) { free_list_bitmaps(sb, jb_array); return -1; } return 0; } /* * find an available list bitmap. If you can't find one, flush a commit list * and try again */ static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb, struct reiserfs_journal_list *jl) { int i, j; struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_list_bitmap *jb = NULL; for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) { i = journal->j_list_bitmap_index; journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS; jb = journal->j_list_bitmap + i; if (journal->j_list_bitmap[i].journal_list) { flush_commit_list(sb, journal->j_list_bitmap[i]. journal_list, 1); if (!journal->j_list_bitmap[i].journal_list) { break; } } else { break; } } /* double check to make sure if flushed correctly */ if (jb->journal_list) return NULL; jb->journal_list = jl; return jb; } /* * allocates a new chunk of X nodes, and links them all together as a list. * Uses the cnode->next and cnode->prev pointers * returns NULL on failure */ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) { struct reiserfs_journal_cnode *head; int i; if (num_cnodes <= 0) { return NULL; } head = vzalloc(array_size(num_cnodes, sizeof(struct reiserfs_journal_cnode))); if (!head) { return NULL; } head[0].prev = NULL; head[0].next = head + 1; for (i = 1; i < num_cnodes; i++) { head[i].prev = head + (i - 1); head[i].next = head + (i + 1); /* if last one, overwrite it after the if */ } head[num_cnodes - 1].next = NULL; return head; } /* pulls a cnode off the free list, or returns NULL on failure */ static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb) { struct reiserfs_journal_cnode *cn; struct reiserfs_journal *journal = SB_JOURNAL(sb); reiserfs_check_lock_depth(sb, "get_cnode"); if (journal->j_cnode_free <= 0) { return NULL; } journal->j_cnode_used++; journal->j_cnode_free--; cn = journal->j_cnode_free_list; if (!cn) { return cn; } if (cn->next) { cn->next->prev = NULL; } journal->j_cnode_free_list = cn->next; memset(cn, 0, sizeof(struct reiserfs_journal_cnode)); return cn; } /* * returns a cnode to the free list */ static void free_cnode(struct super_block *sb, struct reiserfs_journal_cnode *cn) { struct reiserfs_journal *journal = SB_JOURNAL(sb); reiserfs_check_lock_depth(sb, "free_cnode"); journal->j_cnode_used--; journal->j_cnode_free++; /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */ cn->next = journal->j_cnode_free_list; if (journal->j_cnode_free_list) { journal->j_cnode_free_list->prev = cn; } cn->prev = NULL; /* not needed with the memset, but I might kill the memset, and forget to do this */ journal->j_cnode_free_list = cn; } static void clear_prepared_bits(struct buffer_head *bh) { clear_buffer_journal_prepared(bh); clear_buffer_journal_restore_dirty(bh); } /* * return a cnode with same dev, block number and size in table, * or null if not found */ static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct super_block *sb, struct reiserfs_journal_cnode **table, long bl) { struct reiserfs_journal_cnode *cn; cn = journal_hash(table, sb, bl); while (cn) { if (cn->blocknr == bl && cn->sb == sb) return cn; cn = cn->hnext; } return (struct reiserfs_journal_cnode *)0; } /* * this actually means 'can this block be reallocated yet?'. If you set * search_all, a block can only be allocated if it is not in the current * transaction, was not freed by the current transaction, and has no chance * of ever being overwritten by a replay after crashing. * * If you don't set search_all, a block can only be allocated if it is not * in the current transaction. Since deleting a block removes it from the * current transaction, this case should never happen. If you don't set * search_all, make sure you never write the block without logging it. * * next_zero_bit is a suggestion about the next block to try for find_forward. * when bl is rejected because it is set in a journal list bitmap, we search * for the next zero bit in the bitmap that rejected bl. Then, we return * that through next_zero_bit for find_forward to try. * * Just because we return something in next_zero_bit does not mean we won't * reject it on the next call to reiserfs_in_journal */ int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr, int bit_nr, int search_all, b_blocknr_t * next_zero_bit) { struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_list_bitmap *jb; int i; unsigned long bl; *next_zero_bit = 0; /* always start this at zero. */ PROC_INFO_INC(sb, journal.in_journal); /* * If we aren't doing a search_all, this is a metablock, and it * will be logged before use. if we crash before the transaction * that freed it commits, this transaction won't have committed * either, and the block will never be written */ if (search_all) { for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { PROC_INFO_INC(sb, journal.in_journal_bitmap); jb = journal->j_list_bitmap + i; if (jb->journal_list && jb->bitmaps[bmap_nr] && test_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]-> data)) { *next_zero_bit = find_next_zero_bit((unsigned long *) (jb->bitmaps[bmap_nr]-> data), sb->s_blocksize << 3, bit_nr + 1); return 1; } } } bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr; /* is it in any old transactions? */ if (search_all && (get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) { return 1; } /* is it in the current transaction. This should never happen */ if ((get_journal_hash_dev(sb, journal->j_hash_table, bl))) { BUG(); return 1; } PROC_INFO_INC(sb, journal.in_journal_reusable); /* safe for reuse */ return 0; } /* insert cn into table */ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, struct reiserfs_journal_cnode *cn) { struct reiserfs_journal_cnode *cn_orig; cn_orig = journal_hash(table, cn->sb, cn->blocknr); cn->hnext = cn_orig; cn->hprev = NULL; if (cn_orig) { cn_orig->hprev = cn; } journal_hash(table, cn->sb, cn->blocknr) = cn; } /* lock the current transaction */ static inline void lock_journal(struct super_block *sb) { PROC_INFO_INC(sb, journal.lock_journal); reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb); } /* unlock the current transaction */ static inline void unlock_journal(struct super_block *sb) { mutex_unlock(&SB_JOURNAL(sb)->j_mutex); } static inline void get_journal_list(struct reiserfs_journal_list *jl) { jl->j_refcount++; } static inline void put_journal_list(struct super_block *s, struct reiserfs_journal_list *jl) { if (jl->j_refcount < 1) { reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d", jl->j_trans_id, jl->j_refcount); } if (--jl->j_refcount == 0) kfree(jl); } /* * this used to be much more involved, and I'm keeping it just in case * things get ugly again. it gets called by flush_commit_list, and * cleans up any data stored about blocks freed during a transaction. */ static void cleanup_freed_for_journal_list(struct super_block *sb, struct reiserfs_journal_list *jl) { struct reiserfs_list_bitmap *jb = jl->j_list_bitmap; if (jb) { cleanup_bitmap_list(sb, jb); } jl->j_list_bitmap->journal_list = NULL; jl->j_list_bitmap = NULL; } static int journal_list_still_alive(struct super_block *s, unsigned int trans_id) { struct reiserfs_journal *journal = SB_JOURNAL(s); struct list_head *entry = &journal->j_journal_list; struct reiserfs_journal_list *jl; if (!list_empty(entry)) { jl = JOURNAL_LIST_ENTRY(entry->next); if (jl->j_trans_id <= trans_id) { return 1; } } return 0; } /* * If page->mapping was null, we failed to truncate this page for * some reason. Most likely because it was truncated after being * logged via data=journal. * * This does a check to see if the buffer belongs to one of these * lost pages before doing the final put_bh. If page->mapping was * null, it tries to free buffers on the page, which should make the * final put_page drop the page from the lru. */ static void release_buffer_page(struct buffer_head *bh) { struct folio *folio = bh->b_folio; if (!folio->mapping && folio_trylock(folio)) { folio_get(folio); put_bh(bh); if (!folio->mapping) try_to_free_buffers(folio); folio_unlock(folio); folio_put(folio); } else { put_bh(bh); } } static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { if (buffer_journaled(bh)) { reiserfs_warning(NULL, "clm-2084", "pinned buffer %lu:%pg sent to disk", bh->b_blocknr, bh->b_bdev); } if (uptodate) set_buffer_uptodate(bh); else clear_buffer_uptodate(bh); unlock_buffer(bh); release_buffer_page(bh); } static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) { if (uptodate) set_buffer_uptodate(bh); else clear_buffer_uptodate(bh); unlock_buffer(bh); put_bh(bh); } static void submit_logged_buffer(struct buffer_head *bh) { get_bh(bh); bh->b_end_io = reiserfs_end_buffer_io_sync; clear_buffer_journal_new(bh); clear_buffer_dirty(bh); if (!test_clear_buffer_journal_test(bh)) BUG(); if (!buffer_uptodate(bh)) BUG(); submit_bh(REQ_OP_WRITE, bh); } static void submit_ordered_buffer(struct buffer_head *bh) { get_bh(bh); bh->b_end_io = reiserfs_end_ordered_io; clear_buffer_dirty(bh); if (!buffer_uptodate(bh)) BUG(); submit_bh(REQ_OP_WRITE, bh); } #define CHUNK_SIZE 32 struct buffer_chunk { struct buffer_head *bh[CHUNK_SIZE]; int nr; }; static void write_chunk(struct buffer_chunk *chunk) { int i; for (i = 0; i < chunk->nr; i++) { submit_logged_buffer(chunk->bh[i]); } chunk->nr = 0; } static void write_ordered_chunk(struct buffer_chunk *chunk) { int i; for (i = 0; i < chunk->nr; i++) { submit_ordered_buffer(chunk->bh[i]); } chunk->nr = 0; } static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, spinlock_t * lock, void (fn) (struct buffer_chunk *)) { int ret = 0; BUG_ON(chunk->nr >= CHUNK_SIZE); chunk->bh[chunk->nr++] = bh; if (chunk->nr >= CHUNK_SIZE) { ret = 1; if (lock) { spin_unlock(lock); fn(chunk); spin_lock(lock); } else { fn(chunk); } } return ret; } static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0); static struct reiserfs_jh *alloc_jh(void) { struct reiserfs_jh *jh; while (1) { jh = kmalloc(sizeof(*jh), GFP_NOFS); if (jh) { atomic_inc(&nr_reiserfs_jh); return jh; } yield(); } } /* * we want to free the jh when the buffer has been written * and waited on */ void reiserfs_free_jh(struct buffer_head *bh) { struct reiserfs_jh *jh; jh = bh->b_private; if (jh) { bh->b_private = NULL; jh->bh = NULL; list_del_init(&jh->list); kfree(jh); if (atomic_read(&nr_reiserfs_jh) <= 0) BUG(); atomic_dec(&nr_reiserfs_jh); put_bh(bh); } } static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh, int tail) { struct reiserfs_jh *jh; if (bh->b_private) { spin_lock(&j->j_dirty_buffers_lock); if (!bh->b_private) { spin_unlock(&j->j_dirty_buffers_lock); goto no_jh; } jh = bh->b_private; list_del_init(&jh->list); } else { no_jh: get_bh(bh); jh = alloc_jh(); spin_lock(&j->j_dirty_buffers_lock); /* * buffer must be locked for __add_jh, should be able to have * two adds at the same time */ BUG_ON(bh->b_private); jh->bh = bh; bh->b_private = jh; } jh->jl = j->j_current_jl; if (tail) list_add_tail(&jh->list, &jh->jl->j_tail_bh_list); else { list_add_tail(&jh->list, &jh->jl->j_bh_list); } spin_unlock(&j->j_dirty_buffers_lock); return 0; } int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) { return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1); } int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) { return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0); } #define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list) static int write_ordered_buffers(spinlock_t * lock, struct reiserfs_journal *j, struct reiserfs_journal_list *jl, struct list_head *list) { struct buffer_head *bh; struct reiserfs_jh *jh; int ret = j->j_errno; struct buffer_chunk chunk; struct list_head tmp; INIT_LIST_HEAD(&tmp); chunk.nr = 0; spin_lock(lock); while (!list_empty(list)) { jh = JH_ENTRY(list->next); bh = jh->bh; get_bh(bh); if (!trylock_buffer(bh)) { if (!buffer_dirty(bh)) { list_move(&jh->list, &tmp); goto loop_next; } spin_unlock(lock); if (chunk.nr) write_ordered_chunk(&chunk); wait_on_buffer(bh); cond_resched(); spin_lock(lock); goto loop_next; } /* * in theory, dirty non-uptodate buffers should never get here, * but the upper layer io error paths still have a few quirks. * Handle them here as gracefully as we can */ if (!buffer_uptodate(bh) && buffer_dirty(bh)) { clear_buffer_dirty(bh); ret = -EIO; } if (buffer_dirty(bh)) { list_move(&jh->list, &tmp); add_to_chunk(&chunk, bh, lock, write_ordered_chunk); } else { reiserfs_free_jh(bh); unlock_buffer(bh); } loop_next: put_bh(bh); cond_resched_lock(lock); } if (chunk.nr) { spin_unlock(lock); write_ordered_chunk(&chunk); spin_lock(lock); } while (!list_empty(&tmp)) { jh = JH_ENTRY(tmp.prev); bh = jh->bh; get_bh(bh); reiserfs_free_jh(bh); if (buffer_locked(bh)) { spin_unlock(lock); wait_on_buffer(bh); spin_lock(lock); } if (!buffer_uptodate(bh)) { ret = -EIO; } /* * ugly interaction with invalidate_folio here. * reiserfs_invalidate_folio will pin any buffer that has a * valid journal head from an older transaction. If someone * else sets our buffer dirty after we write it in the first * loop, and then someone truncates the page away, nobody * will ever write the buffer. We're safe if we write the * page one last time after freeing the journal header. */ if (buffer_dirty(bh) && unlikely(bh->b_folio->mapping == NULL)) { spin_unlock(lock); write_dirty_buffer(bh, 0); spin_lock(lock); } put_bh(bh); cond_resched_lock(lock); } spin_unlock(lock); return ret; } static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) { struct reiserfs_journal *journal = SB_JOURNAL(s); struct reiserfs_journal_list *other_jl; struct reiserfs_journal_list *first_jl; struct list_head *entry; unsigned int trans_id = jl->j_trans_id; unsigned int other_trans_id; find_first: /* * first we walk backwards to find the oldest uncommitted transation */ first_jl = jl; entry = jl->j_list.prev; while (1) { other_jl = JOURNAL_LIST_ENTRY(entry); if (entry == &journal->j_journal_list || atomic_read(&other_jl->j_older_commits_done)) break; first_jl = other_jl; entry = other_jl->j_list.prev; } /* if we didn't find any older uncommitted transactions, return now */ if (first_jl == jl) { return 0; } entry = &first_jl->j_list; while (1) { other_jl = JOURNAL_LIST_ENTRY(entry); other_trans_id = other_jl->j_trans_id; if (other_trans_id < trans_id) { if (atomic_read(&other_jl->j_commit_left) != 0) { flush_commit_list(s, other_jl, 0); /* list we were called with is gone, return */ if (!journal_list_still_alive(s, trans_id)) return 1; /* * the one we just flushed is gone, this means * all older lists are also gone, so first_jl * is no longer valid either. Go back to the * beginning. */ if (!journal_list_still_alive (s, other_trans_id)) { goto find_first; } } entry = entry->next; if (entry == &journal->j_journal_list) return 0; } else { return 0; } } return 0; } static int reiserfs_async_progress_wait(struct super_block *s) { struct reiserfs_journal *j = SB_JOURNAL(s); if (atomic_read(&j->j_async_throttle)) { int depth; depth = reiserfs_write_unlock_nested(s); wait_var_event_timeout(&j->j_async_throttle, atomic_read(&j->j_async_throttle) == 0, HZ / 10); reiserfs_write_lock_nested(s, depth); } return 0; } /* * if this journal list still has commit blocks unflushed, send them to disk. * * log areas must be flushed in order (transaction 2 can't commit before * transaction 1) Before the commit block can by written, every other log * block must be safely on disk */ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { int i; b_blocknr_t bn; struct buffer_head *tbh = NULL; unsigned int trans_id = jl->j_trans_id; struct reiserfs_journal *journal = SB_JOURNAL(s); int retval = 0; int write_len; int depth; reiserfs_check_lock_depth(s, "flush_commit_list"); if (atomic_read(&jl->j_older_commits_done)) { return 0; } /* * before we can put our commit blocks on disk, we have to make * sure everyone older than us is on disk too */ BUG_ON(jl->j_len <= 0); BUG_ON(trans_id == journal->j_trans_id); get_journal_list(jl); if (flushall) { if (flush_older_commits(s, jl) == 1) { /* * list disappeared during flush_older_commits. * return */ goto put_jl; } } /* make sure nobody is trying to flush this one at the same time */ reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s); if (!journal_list_still_alive(s, trans_id)) { mutex_unlock(&jl->j_commit_mutex); goto put_jl; } BUG_ON(jl->j_trans_id == 0); /* this commit is done, exit */ if (atomic_read(&jl->j_commit_left) <= 0) { if (flushall) { atomic_set(&jl->j_older_commits_done, 1); } mutex_unlock(&jl->j_commit_mutex); goto put_jl; } if (!list_empty(&jl->j_bh_list)) { int ret; /* * We might sleep in numerous places inside * write_ordered_buffers. Relax the write lock. */ depth = reiserfs_write_unlock_nested(s); ret = write_ordered_buffers(&journal->j_dirty_buffers_lock, journal, jl, &jl->j_bh_list); if (ret < 0 && retval == 0) retval = ret; reiserfs_write_lock_nested(s, depth); } BUG_ON(!list_empty(&jl->j_bh_list)); /* * for the description block and all the log blocks, submit any buffers * that haven't already reached the disk. Try to write at least 256 * log blocks. later on, we will only wait on blocks that correspond * to this transaction, but while we're unplugging we might as well * get a chunk of data on there. */ atomic_inc(&journal->j_async_throttle); write_len = jl->j_len + 1; if (write_len < 256) write_len = 256; for (i = 0 ; i < write_len ; i++) { bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); tbh = journal_find_get_block(s, bn); if (tbh) { if (buffer_dirty(tbh)) { depth = reiserfs_write_unlock_nested(s); write_dirty_buffer(tbh, 0); reiserfs_write_lock_nested(s, depth); } put_bh(tbh) ; } } if (atomic_dec_and_test(&journal->j_async_throttle)) wake_up_var(&journal->j_async_throttle); for (i = 0; i < (jl->j_len + 1); i++) { bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); tbh = journal_find_get_block(s, bn); depth = reiserfs_write_unlock_nested(s); __wait_on_buffer(tbh); reiserfs_write_lock_nested(s, depth); /* * since we're using ll_rw_blk above, it might have skipped * over a locked buffer. Double check here */ /* redundant, sync_dirty_buffer() checks */ if (buffer_dirty(tbh)) { depth = reiserfs_write_unlock_nested(s); sync_dirty_buffer(tbh); reiserfs_write_lock_nested(s, depth); } if (unlikely(!buffer_uptodate(tbh))) { #ifdef CONFIG_REISERFS_CHECK reiserfs_warning(s, "journal-601", "buffer write failed"); #endif retval = -EIO; } /* once for journal_find_get_block */ put_bh(tbh); /* once due to original getblk in do_journal_end */ put_bh(tbh); atomic_dec(&jl->j_commit_left); } BUG_ON(atomic_read(&jl->j_commit_left) != 1); /* * If there was a write error in the journal - we can't commit * this transaction - it will be invalid and, if successful, * will just end up propagating the write error out to * the file system. */ if (likely(!retval && !reiserfs_is_journal_aborted (journal))) { if (buffer_dirty(jl->j_commit_bh)) BUG(); mark_buffer_dirty(jl->j_commit_bh) ; depth = reiserfs_write_unlock_nested(s); if (reiserfs_barrier_flush(s)) __sync_dirty_buffer(jl->j_commit_bh, REQ_SYNC | REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(jl->j_commit_bh); reiserfs_write_lock_nested(s, depth); } /* * If there was a write error in the journal - we can't commit this * transaction - it will be invalid and, if successful, will just end * up propagating the write error out to the filesystem. */ if (unlikely(!buffer_uptodate(jl->j_commit_bh))) { #ifdef CONFIG_REISERFS_CHECK reiserfs_warning(s, "journal-615", "buffer write failed"); #endif retval = -EIO; } bforget(jl->j_commit_bh); if (journal->j_last_commit_id != 0 && (jl->j_trans_id - journal->j_last_commit_id) != 1) { reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu", journal->j_last_commit_id, jl->j_trans_id); } journal->j_last_commit_id = jl->j_trans_id; /* * now, every commit block is on the disk. It is safe to allow * blocks freed during this transaction to be reallocated */ cleanup_freed_for_journal_list(s, jl); retval = retval ? retval : journal->j_errno; /* mark the metadata dirty */ if (!retval) dirty_one_transaction(s, jl); atomic_dec(&jl->j_commit_left); if (flushall) { atomic_set(&jl->j_older_commits_done, 1); } mutex_unlock(&jl->j_commit_mutex); put_jl: put_journal_list(s, jl); if (retval) reiserfs_abort(s, retval, "Journal write error in %s", __func__); return retval; } /* * flush_journal_list frequently needs to find a newer transaction for a * given block. This does that, or returns NULL if it can't find anything */ static struct reiserfs_journal_list *find_newer_jl_for_cn(struct reiserfs_journal_cnode *cn) { struct super_block *sb = cn->sb; b_blocknr_t blocknr = cn->blocknr; cn = cn->hprev; while (cn) { if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) { return cn->jlist; } cn = cn->hprev; } return NULL; } static void remove_journal_hash(struct super_block *, struct reiserfs_journal_cnode **, struct reiserfs_journal_list *, unsigned long, int); /* * once all the real blocks have been flushed, it is safe to remove them * from the journal list for this transaction. Aside from freeing the * cnode, this also allows the block to be reallocated for data blocks * if it had been deleted. */ static void remove_all_from_journal_list(struct super_block *sb, struct reiserfs_journal_list *jl, int debug) { struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_journal_cnode *cn, *last; cn = jl->j_realblock; /* * which is better, to lock once around the whole loop, or * to lock for each call to remove_journal_hash? */ while (cn) { if (cn->blocknr != 0) { if (debug) { reiserfs_warning(sb, "reiserfs-2201", "block %u, bh is %d, state %ld", cn->blocknr, cn->bh ? 1 : 0, cn->state); } cn->state = 0; remove_journal_hash(sb, journal->j_list_hash_table, jl, cn->blocknr, 1); } last = cn; cn = cn->next; free_cnode(sb, last); } jl->j_realblock = NULL; } /* * if this timestamp is greater than the timestamp we wrote last to the * header block, write it to the header block. once this is done, I can * safely say the log area for this transaction won't ever be replayed, * and I can start releasing blocks in this transaction for reuse as data * blocks. called by flush_journal_list, before it calls * remove_all_from_journal_list */ static int _update_journal_header_block(struct super_block *sb, unsigned long offset, unsigned int trans_id) { struct reiserfs_journal_header *jh; struct reiserfs_journal *journal = SB_JOURNAL(sb); int depth; if (reiserfs_is_journal_aborted(journal)) return -EIO; if (trans_id >= journal->j_last_flush_trans_id) { if (buffer_locked((journal->j_header_bh))) { depth = reiserfs_write_unlock_nested(sb); __wait_on_buffer(journal->j_header_bh); reiserfs_write_lock_nested(sb, depth); if (unlikely(!buffer_uptodate(journal->j_header_bh))) { #ifdef CONFIG_REISERFS_CHECK reiserfs_warning(sb, "journal-699", "buffer write failed"); #endif return -EIO; } } journal->j_last_flush_trans_id = trans_id; journal->j_first_unflushed_offset = offset; jh = (struct reiserfs_journal_header *)(journal->j_header_bh-> b_data); jh->j_last_flush_trans_id = cpu_to_le32(trans_id); jh->j_first_unflushed_offset = cpu_to_le32(offset); jh->j_mount_id = cpu_to_le32(journal->j_mount_id); set_buffer_dirty(journal->j_header_bh); depth = reiserfs_write_unlock_nested(sb); if (reiserfs_barrier_flush(sb)) __sync_dirty_buffer(journal->j_header_bh, REQ_SYNC | REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(journal->j_header_bh); reiserfs_write_lock_nested(sb, depth); if (!buffer_uptodate(journal->j_header_bh)) { reiserfs_warning(sb, "journal-837", "IO error during journal replay"); return -EIO; } } return 0; } static int update_journal_header_block(struct super_block *sb, unsigned long offset, unsigned int trans_id) { return _update_journal_header_block(sb, offset, trans_id); } /* ** flush any and all journal lists older than you are ** can only be called from flush_journal_list */ static int flush_older_journal_lists(struct super_block *sb, struct reiserfs_journal_list *jl) { struct list_head *entry; struct reiserfs_journal_list *other_jl; struct reiserfs_journal *journal = SB_JOURNAL(sb); unsigned int trans_id = jl->j_trans_id; /* * we know we are the only ones flushing things, no extra race * protection is required. */ restart: entry = journal->j_journal_list.next; /* Did we wrap? */ if (entry == &journal->j_journal_list) return 0; other_jl = JOURNAL_LIST_ENTRY(entry); if (other_jl->j_trans_id < trans_id) { BUG_ON(other_jl->j_refcount <= 0); /* do not flush all */ flush_journal_list(sb, other_jl, 0); /* other_jl is now deleted from the list */ goto restart; } return 0; } static void del_from_work_list(struct super_block *s, struct reiserfs_journal_list *jl) { struct reiserfs_journal *journal = SB_JOURNAL(s); if (!list_empty(&jl->j_working_list)) { list_del_init(&jl->j_working_list); journal->j_num_work_lists--; } } /* * flush a journal list, both commit and real blocks * * always set flushall to 1, unless you are calling from inside * flush_journal_list * * IMPORTANT. This can only be called while there are no journal writers, * and the journal is locked. That means it can only be called from * do_journal_end, or by journal_release */ static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { struct reiserfs_journal_list *pjl; struct reiserfs_journal_cnode *cn; int count; int was_jwait = 0; int was_dirty = 0; struct buffer_head *saved_bh; unsigned long j_len_saved = jl->j_len; struct reiserfs_journal *journal = SB_JOURNAL(s); int err = 0; int depth; BUG_ON(j_len_saved <= 0); if (atomic_read(&journal->j_wcount) != 0) { reiserfs_warning(s, "clm-2048", "called with wcount %d", atomic_read(&journal->j_wcount)); } /* if flushall == 0, the lock is already held */ if (flushall) { reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s); } else if (mutex_trylock(&journal->j_flush_mutex)) { BUG(); } count = 0; if (j_len_saved > journal->j_trans_max) { reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu", j_len_saved, jl->j_trans_id); return 0; } /* if all the work is already done, get out of here */ if (atomic_read(&jl->j_nonzerolen) <= 0 && atomic_read(&jl->j_commit_left) <= 0) { goto flush_older_and_return; } /* * start by putting the commit list on disk. This will also flush * the commit lists of any olders transactions */ flush_commit_list(s, jl, 1); if (!(jl->j_state & LIST_DIRTY) && !reiserfs_is_journal_aborted(journal)) BUG(); /* are we done now? */ if (atomic_read(&jl->j_nonzerolen) <= 0 && atomic_read(&jl->j_commit_left) <= 0) { goto flush_older_and_return; } /* * loop through each cnode, see if we need to write it, * or wait on a more recent transaction, or just ignore it */ if (atomic_read(&journal->j_wcount) != 0) { reiserfs_panic(s, "journal-844", "journal list is flushing, " "wcount is not 0"); } cn = jl->j_realblock; while (cn) { was_jwait = 0; was_dirty = 0; saved_bh = NULL; /* blocknr of 0 is no longer in the hash, ignore it */ if (cn->blocknr == 0) { goto free_cnode; } /* * This transaction failed commit. * Don't write out to the disk */ if (!(jl->j_state & LIST_DIRTY)) goto free_cnode; pjl = find_newer_jl_for_cn(cn); /* * the order is important here. We check pjl to make sure we * don't clear BH_JDirty_wait if we aren't the one writing this * block to disk */ if (!pjl && cn->bh) { saved_bh = cn->bh; /* * we do this to make sure nobody releases the * buffer while we are working with it */ get_bh(saved_bh); if (buffer_journal_dirty(saved_bh)) { BUG_ON(!can_dirty(cn)); was_jwait = 1; was_dirty = 1; } else if (can_dirty(cn)) { /* * everything with !pjl && jwait * should be writable */ BUG(); } } /* * if someone has this block in a newer transaction, just make * sure they are committed, and don't try writing it to disk */ if (pjl) { if (atomic_read(&pjl->j_commit_left)) flush_commit_list(s, pjl, 1); goto free_cnode; } /* * bh == NULL when the block got to disk on its own, OR, * the block got freed in a future transaction */ if (saved_bh == NULL) { goto free_cnode; } /* * this should never happen. kupdate_one_transaction has * this list locked while it works, so we should never see a * buffer here that is not marked JDirty_wait */ if ((!was_jwait) && !buffer_locked(saved_bh)) { reiserfs_warning(s, "journal-813", "BAD! buffer %llu %cdirty %cjwait, " "not in a newer transaction", (unsigned long long)saved_bh-> b_blocknr, was_dirty ? ' ' : '!', was_jwait ? ' ' : '!'); } if (was_dirty) { /* * we inc again because saved_bh gets decremented * at free_cnode */ get_bh(saved_bh); set_bit(BLOCK_NEEDS_FLUSH, &cn->state); lock_buffer(saved_bh); BUG_ON(cn->blocknr != saved_bh->b_blocknr); if (buffer_dirty(saved_bh)) submit_logged_buffer(saved_bh); else unlock_buffer(saved_bh); count++; } else { reiserfs_warning(s, "clm-2082", "Unable to flush buffer %llu in %s", (unsigned long long)saved_bh-> b_blocknr, __func__); } free_cnode: cn = cn->next; if (saved_bh) { /* * we incremented this to keep others from * taking the buffer head away */ put_bh(saved_bh); if (atomic_read(&saved_bh->b_count) < 0) { reiserfs_warning(s, "journal-945", "saved_bh->b_count < 0"); } } } if (count > 0) { cn = jl->j_realblock; while (cn) { if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { if (!cn->bh) { reiserfs_panic(s, "journal-1011", "cn->bh is NULL"); } depth = reiserfs_write_unlock_nested(s); __wait_on_buffer(cn->bh); reiserfs_write_lock_nested(s, depth); if (!cn->bh) { reiserfs_panic(s, "journal-1012", "cn->bh is NULL"); } if (unlikely(!buffer_uptodate(cn->bh))) { #ifdef CONFIG_REISERFS_CHECK reiserfs_warning(s, "journal-949", "buffer write failed"); #endif err = -EIO; } /* * note, we must clear the JDirty_wait bit * after the up to date check, otherwise we * race against our flushpage routine */ BUG_ON(!test_clear_buffer_journal_dirty (cn->bh)); /* drop one ref for us */ put_bh(cn->bh); /* drop one ref for journal_mark_dirty */ release_buffer_page(cn->bh); } cn = cn->next; } } if (err) reiserfs_abort(s, -EIO, "Write error while pushing transaction to disk in %s", __func__); flush_older_and_return: /* * before we can update the journal header block, we _must_ flush all * real blocks from all older transactions to disk. This is because * once the header block is updated, this transaction will not be * replayed after a crash */ if (flushall) { flush_older_journal_lists(s, jl); } err = journal->j_errno; /* * before we can remove everything from the hash tables for this * transaction, we must make sure it can never be replayed * * since we are only called from do_journal_end, we know for sure there * are no allocations going on while we are flushing journal lists. So, * we only need to update the journal header block for the last list * being flushed */ if (!err && flushall) { err = update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(s), jl->j_trans_id); if (err) reiserfs_abort(s, -EIO, "Write error while updating journal header in %s", __func__); } remove_all_from_journal_list(s, jl, 0); list_del_init(&jl->j_list); journal->j_num_lists--; del_from_work_list(s, jl); if (journal->j_last_flush_id != 0 && (jl->j_trans_id - journal->j_last_flush_id) != 1) { reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu", journal->j_last_flush_id, jl->j_trans_id); } journal->j_last_flush_id = jl->j_trans_id; /* * not strictly required since we are freeing the list, but it should * help find code using dead lists later on */ jl->j_len = 0; atomic_set(&jl->j_nonzerolen, 0); jl->j_start = 0; jl->j_realblock = NULL; jl->j_commit_bh = NULL; jl->j_trans_id = 0; jl->j_state = 0; put_journal_list(s, jl); if (flushall) mutex_unlock(&journal->j_flush_mutex); return err; } static int write_one_transaction(struct super_block *s, struct reiserfs_journal_list *jl, struct buffer_chunk *chunk) { struct reiserfs_journal_cnode *cn; int ret = 0; jl->j_state |= LIST_TOUCHED; del_from_work_list(s, jl); if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) { return 0; } cn = jl->j_realblock; while (cn) { /* * if the blocknr == 0, this has been cleared from the hash, * skip it */ if (cn->blocknr == 0) { goto next; } if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) { struct buffer_head *tmp_bh; /* * we can race against journal_mark_freed when we try * to lock_buffer(cn->bh), so we have to inc the buffer * count, and recheck things after locking */ tmp_bh = cn->bh; get_bh(tmp_bh); lock_buffer(tmp_bh); if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) { if (!buffer_journal_dirty(tmp_bh) || buffer_journal_prepared(tmp_bh)) BUG(); add_to_chunk(chunk, tmp_bh, NULL, write_chunk); ret++; } else { /* note, cn->bh might be null now */ unlock_buffer(tmp_bh); } put_bh(tmp_bh); } next: cn = cn->next; cond_resched(); } return ret; } /* used by flush_commit_list */ static void dirty_one_transaction(struct super_block *s, struct reiserfs_journal_list *jl) { struct reiserfs_journal_cnode *cn; struct reiserfs_journal_list *pjl; jl->j_state |= LIST_DIRTY; cn = jl->j_realblock; while (cn) { /* * look for a more recent transaction that logged this * buffer. Only the most recent transaction with a buffer in * it is allowed to send that buffer to disk */ pjl = find_newer_jl_for_cn(cn); if (!pjl && cn->blocknr && cn->bh && buffer_journal_dirty(cn->bh)) { BUG_ON(!can_dirty(cn)); /* * if the buffer is prepared, it will either be logged * or restored. If restored, we need to make sure * it actually gets marked dirty */ clear_buffer_journal_new(cn->bh); if (buffer_journal_prepared(cn->bh)) { set_buffer_journal_restore_dirty(cn->bh); } else { set_buffer_journal_test(cn->bh); mark_buffer_dirty(cn->bh); } } cn = cn->next; } } static int kupdate_transactions(struct super_block *s, struct reiserfs_journal_list *jl, struct reiserfs_journal_list **next_jl, unsigned int *next_trans_id, int num_blocks, int num_trans) { int ret = 0; int written = 0; int transactions_flushed = 0; unsigned int orig_trans_id = jl->j_trans_id; struct buffer_chunk chunk; struct list_head *entry; struct reiserfs_journal *journal = SB_JOURNAL(s); chunk.nr = 0; reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s); if (!journal_list_still_alive(s, orig_trans_id)) { goto done; } /* * we've got j_flush_mutex held, nobody is going to delete any * of these lists out from underneath us */ while ((num_trans && transactions_flushed < num_trans) || (!num_trans && written < num_blocks)) { if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) || atomic_read(&jl->j_commit_left) || !(jl->j_state & LIST_DIRTY)) { del_from_work_list(s, jl); break; } ret = write_one_transaction(s, jl, &chunk); if (ret < 0) goto done; transactions_flushed++; written += ret; entry = jl->j_list.next; /* did we wrap? */ if (entry == &journal->j_journal_list) { break; } jl = JOURNAL_LIST_ENTRY(entry); /* don't bother with older transactions */ if (jl->j_trans_id <= orig_trans_id) break; } if (chunk.nr) { write_chunk(&chunk); } done: mutex_unlock(&journal->j_flush_mutex); return ret; } /* * for o_sync and fsync heavy applications, they tend to use * all the journa list slots with tiny transactions. These * trigger lots and lots of calls to update the header block, which * adds seeks and slows things down. * * This function tries to clear out a large chunk of the journal lists * at once, which makes everything faster since only the newest journal * list updates the header block */ static int flush_used_journal_lists(struct super_block *s, struct reiserfs_journal_list *jl) { unsigned long len = 0; unsigned long cur_len; int i; int limit = 256; struct reiserfs_journal_list *tjl; struct reiserfs_journal_list *flush_jl; unsigned int trans_id; struct reiserfs_journal *journal = SB_JOURNAL(s); flush_jl = tjl = jl; /* in data logging mode, try harder to flush a lot of blocks */ if (reiserfs_data_log(s)) limit = 1024; /* flush for 256 transactions or limit blocks, whichever comes first */ for (i = 0; i < 256 && len < limit; i++) { if (atomic_read(&tjl->j_commit_left) || tjl->j_trans_id < jl->j_trans_id) { break; } cur_len = atomic_read(&tjl->j_nonzerolen); if (cur_len > 0) { tjl->j_state &= ~LIST_TOUCHED; } len += cur_len; flush_jl = tjl; if (tjl->j_list.next == &journal->j_journal_list) break; tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next); } get_journal_list(jl); get_journal_list(flush_jl); /* * try to find a group of blocks we can flush across all the * transactions, but only bother if we've actually spanned * across multiple lists */ if (flush_jl != jl) kupdate_transactions(s, jl, &tjl, &trans_id, len, i); flush_journal_list(s, flush_jl, 1); put_journal_list(s, flush_jl); put_journal_list(s, jl); return 0; } /* * removes any nodes in table with name block and dev as bh. * only touchs the hnext and hprev pointers. */ static void remove_journal_hash(struct super_block *sb, struct reiserfs_journal_cnode **table, struct reiserfs_journal_list *jl, unsigned long block, int remove_freed) { struct reiserfs_journal_cnode *cur; struct reiserfs_journal_cnode **head; head = &(journal_hash(table, sb, block)); if (!head) { return; } cur = *head; while (cur) { if (cur->blocknr == block && cur->sb == sb && (jl == NULL || jl == cur->jlist) && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) { if (cur->hnext) { cur->hnext->hprev = cur->hprev; } if (cur->hprev) { cur->hprev->hnext = cur->hnext; } else { *head = cur->hnext; } cur->blocknr = 0; cur->sb = NULL; cur->state = 0; /* * anybody who clears the cur->bh will also * dec the nonzerolen */ if (cur->bh && cur->jlist) atomic_dec(&cur->jlist->j_nonzerolen); cur->bh = NULL; cur->jlist = NULL; } cur = cur->hnext; } } static void free_journal_ram(struct super_block *sb) { struct reiserfs_journal *journal = SB_JOURNAL(sb); kfree(journal->j_current_jl); journal->j_num_lists--; vfree(journal->j_cnode_free_orig); free_list_bitmaps(sb, journal->j_list_bitmap); free_bitmap_nodes(sb); /* must be after free_list_bitmaps */ if (journal->j_header_bh) { brelse(journal->j_header_bh); } /* * j_header_bh is on the journal dev, make sure * not to release the journal dev until we brelse j_header_bh */ release_journal_dev(journal); vfree(journal); } /* * call on unmount. Only set error to 1 if you haven't made your way out * of read_super() yet. Any other caller must keep error at 0. */ static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *sb, int error) { struct reiserfs_transaction_handle myth; struct reiserfs_journal *journal = SB_JOURNAL(sb); /* * we only want to flush out transactions if we were * called with error == 0 */ if (!error && !sb_rdonly(sb)) { /* end the current trans */ BUG_ON(!th->t_trans_id); do_journal_end(th, FLUSH_ALL); /* * make sure something gets logged to force * our way into the flush code */ if (!journal_join(&myth, sb)) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb)); do_journal_end(&myth, FLUSH_ALL); } } /* this also catches errors during the do_journal_end above */ if (!error && reiserfs_is_journal_aborted(journal)) { memset(&myth, 0, sizeof(myth)); if (!journal_join_abort(&myth, sb)) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb)); do_journal_end(&myth, FLUSH_ALL); } } /* * We must release the write lock here because * the workqueue job (flush_async_commit) needs this lock */ reiserfs_write_unlock(sb); /* * Cancel flushing of old commits. Note that neither of these works * will be requeued because superblock is being shutdown and doesn't * have SB_ACTIVE set. */ reiserfs_cancel_old_flush(sb); /* wait for all commits to finish */ cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work); free_journal_ram(sb); reiserfs_write_lock(sb); return 0; } /* * call on unmount. flush all journal trans, release all alloc'd ram */ int journal_release(struct reiserfs_transaction_handle *th, struct super_block *sb) { return do_journal_release(th, sb, 0); } /* only call from an error condition inside reiserfs_read_super! */ int journal_release_error(struct reiserfs_transaction_handle *th, struct super_block *sb) { return do_journal_release(th, sb, 1); } /* * compares description block with commit block. * returns 1 if they differ, 0 if they are the same */ static int journal_compare_desc_commit(struct super_block *sb, struct reiserfs_journal_desc *desc, struct reiserfs_journal_commit *commit) { if (get_commit_trans_id(commit) != get_desc_trans_id(desc) || get_commit_trans_len(commit) != get_desc_trans_len(desc) || get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max || get_commit_trans_len(commit) <= 0) { return 1; } return 0; } /* * returns 0 if it did not find a description block * returns -1 if it found a corrupt commit block * returns 1 if both desc and commit were valid * NOTE: only called during fs mount */ static int journal_transaction_is_valid(struct super_block *sb, struct buffer_head *d_bh, unsigned int *oldest_invalid_trans_id, unsigned long *newest_mount_id) { struct reiserfs_journal_desc *desc; struct reiserfs_journal_commit *commit; struct buffer_head *c_bh; unsigned long offset; if (!d_bh) return 0; desc = (struct reiserfs_journal_desc *)d_bh->b_data; if (get_desc_trans_len(desc) > 0 && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) { if (oldest_invalid_trans_id && *oldest_invalid_trans_id && get_desc_trans_id(desc) > *oldest_invalid_trans_id) { reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-986: transaction " "is valid returning because trans_id %d is greater than " "oldest_invalid %lu", get_desc_trans_id(desc), *oldest_invalid_trans_id); return 0; } if (newest_mount_id && *newest_mount_id > get_desc_mount_id(desc)) { reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1087: transaction " "is valid returning because mount_id %d is less than " "newest_mount_id %lu", get_desc_mount_id(desc), *newest_mount_id); return -1; } if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) { reiserfs_warning(sb, "journal-2018", "Bad transaction length %d " "encountered, ignoring transaction", get_desc_trans_len(desc)); return -1; } offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb); /* * ok, we have a journal description block, * let's see if the transaction was valid */ c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + ((offset + get_desc_trans_len(desc) + 1) % SB_ONDISK_JOURNAL_SIZE(sb))); if (!c_bh) return 0; commit = (struct reiserfs_journal_commit *)c_bh->b_data; if (journal_compare_desc_commit(sb, desc, commit)) { reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal_transaction_is_valid, commit offset %ld had bad " "time %d or length %d", c_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb), get_commit_trans_id(commit), get_commit_trans_len(commit)); brelse(c_bh); if (oldest_invalid_trans_id) { *oldest_invalid_trans_id = get_desc_trans_id(desc); reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1004: " "transaction_is_valid setting oldest invalid trans_id " "to %d", get_desc_trans_id(desc)); } return -1; } brelse(c_bh); reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1006: found valid " "transaction start offset %llu, len %d id %d", d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb), get_desc_trans_len(desc), get_desc_trans_id(desc)); return 1; } else { return 0; } } static void brelse_array(struct buffer_head **heads, int num) { int i; for (i = 0; i < num; i++) { brelse(heads[i]); } } /* * given the start, and values for the oldest acceptable transactions, * this either reads in a replays a transaction, or returns because the * transaction is invalid, or too old. * NOTE: only called during fs mount */ static int journal_read_transaction(struct super_block *sb, unsigned long cur_dblock, unsigned long oldest_start, unsigned int oldest_trans_id, unsigned long newest_mount_id) { struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_journal_desc *desc; struct reiserfs_journal_commit *commit; unsigned int trans_id = 0; struct buffer_head *c_bh; struct buffer_head *d_bh; struct buffer_head **log_blocks = NULL; struct buffer_head **real_blocks = NULL; unsigned int trans_offset; int i; int trans_half; d_bh = journal_bread(sb, cur_dblock); if (!d_bh) return 1; desc = (struct reiserfs_journal_desc *)d_bh->b_data; trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb); reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: " "journal_read_transaction, offset %llu, len %d mount_id %d", d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb), get_desc_trans_len(desc), get_desc_mount_id(desc)); if (get_desc_trans_id(desc) < oldest_trans_id) { reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: " "journal_read_trans skipping because %lu is too old", cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb)); brelse(d_bh); return 1; } if (get_desc_mount_id(desc) != newest_mount_id) { reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: " "journal_read_trans skipping because %d is != " "newest_mount_id %lu", get_desc_mount_id(desc), newest_mount_id); brelse(d_bh); return 1; } c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + ((trans_offset + get_desc_trans_len(desc) + 1) % SB_ONDISK_JOURNAL_SIZE(sb))); if (!c_bh) { brelse(d_bh); return 1; } commit = (struct reiserfs_journal_commit *)c_bh->b_data; if (journal_compare_desc_commit(sb, desc, commit)) { reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal_read_transaction, " "commit offset %llu had bad time %d or length %d", c_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb), get_commit_trans_id(commit), get_commit_trans_len(commit)); brelse(c_bh); brelse(d_bh); return 1; } if (bdev_read_only(sb->s_bdev)) { reiserfs_warning(sb, "clm-2076", "device is readonly, unable to replay log"); brelse(c_bh); brelse(d_bh); return -EROFS; } trans_id = get_desc_trans_id(desc); /* * now we know we've got a good transaction, and it was * inside the valid time ranges */ log_blocks = kmalloc_array(get_desc_trans_len(desc), sizeof(struct buffer_head *), GFP_NOFS); real_blocks = kmalloc_array(get_desc_trans_len(desc), sizeof(struct buffer_head *), GFP_NOFS); if (!log_blocks || !real_blocks) { brelse(c_bh); brelse(d_bh); kfree(log_blocks); kfree(real_blocks); reiserfs_warning(sb, "journal-1169", "kmalloc failed, unable to mount FS"); return -1; } /* get all the buffer heads */ trans_half = journal_trans_half(sb->s_blocksize); for (i = 0; i < get_desc_trans_len(desc); i++) { log_blocks[i] = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + (trans_offset + 1 + i) % SB_ONDISK_JOURNAL_SIZE(sb)); if (i < trans_half) { real_blocks[i] = sb_getblk(sb, le32_to_cpu(desc->j_realblock[i])); } else { real_blocks[i] = sb_getblk(sb, le32_to_cpu(commit-> j_realblock[i - trans_half])); } if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) { reiserfs_warning(sb, "journal-1207", "REPLAY FAILURE fsck required! " "Block to replay is outside of " "filesystem"); goto abort_replay; } /* make sure we don't try to replay onto log or reserved area */ if (is_block_in_log_or_reserved_area (sb, real_blocks[i]->b_blocknr)) { reiserfs_warning(sb, "journal-1204", "REPLAY FAILURE fsck required! " "Trying to replay onto a log block"); abort_replay: brelse_array(log_blocks, i); brelse_array(real_blocks, i); brelse(c_bh); brelse(d_bh); kfree(log_blocks); kfree(real_blocks); return -1; } } /* read in the log blocks, memcpy to the corresponding real block */ bh_read_batch(get_desc_trans_len(desc), log_blocks); for (i = 0; i < get_desc_trans_len(desc); i++) { wait_on_buffer(log_blocks[i]); if (!buffer_uptodate(log_blocks[i])) { reiserfs_warning(sb, "journal-1212", "REPLAY FAILURE fsck required! " "buffer write failed"); brelse_array(log_blocks + i, get_desc_trans_len(desc) - i); brelse_array(real_blocks, get_desc_trans_len(desc)); brelse(c_bh); brelse(d_bh); kfree(log_blocks); kfree(real_blocks); return -1; } memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, real_blocks[i]->b_size); set_buffer_uptodate(real_blocks[i]); brelse(log_blocks[i]); } /* flush out the real blocks */ for (i = 0; i < get_desc_trans_len(desc); i++) { set_buffer_dirty(real_blocks[i]); write_dirty_buffer(real_blocks[i], 0); } for (i = 0; i < get_desc_trans_len(desc); i++) { wait_on_buffer(real_blocks[i]); if (!buffer_uptodate(real_blocks[i])) { reiserfs_warning(sb, "journal-1226", "REPLAY FAILURE, fsck required! " "buffer write failed"); brelse_array(real_blocks + i, get_desc_trans_len(desc) - i); brelse(c_bh); brelse(d_bh); kfree(log_blocks); kfree(real_blocks); return -1; } brelse(real_blocks[i]); } cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb) + ((trans_offset + get_desc_trans_len(desc) + 2) % SB_ONDISK_JOURNAL_SIZE(sb)); reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1095: setting journal " "start to offset %ld", cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb)); /* * init starting values for the first transaction, in case * this is the last transaction to be replayed. */ journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb); journal->j_last_flush_trans_id = trans_id; journal->j_trans_id = trans_id + 1; /* check for trans_id overflow */ if (journal->j_trans_id == 0) journal->j_trans_id = 10; brelse(c_bh); brelse(d_bh); kfree(log_blocks); kfree(real_blocks); return 0; } /* * This function reads blocks starting from block and to max_block of bufsize * size (but no more than BUFNR blocks at a time). This proved to improve * mounting speed on self-rebuilding raid5 arrays at least. * Right now it is only used from journal code. But later we might use it * from other places. * Note: Do not use journal_getblk/sb_getblk functions here! */ static struct buffer_head *reiserfs_breada(struct block_device *dev, b_blocknr_t block, int bufsize, b_blocknr_t max_block) { struct buffer_head *bhlist[BUFNR]; unsigned int blocks = BUFNR; struct buffer_head *bh; int i, j; bh = __getblk(dev, block, bufsize); if (!bh || buffer_uptodate(bh)) return (bh); if (block + BUFNR > max_block) { blocks = max_block - block; } bhlist[0] = bh; j = 1; for (i = 1; i < blocks; i++) { bh = __getblk(dev, block + i, bufsize); if (!bh) break; if (buffer_uptodate(bh)) { brelse(bh); break; } else bhlist[j++] = bh; } bh = bhlist[0]; bh_read_nowait(bh, 0); bh_readahead_batch(j - 1, &bhlist[1], 0); for (i = 1; i < j; i++) brelse(bhlist[i]); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; brelse(bh); return NULL; } /* * read and replay the log * on a clean unmount, the journal header's next unflushed pointer will be * to an invalid transaction. This tests that before finding all the * transactions in the log, which makes normal mount times fast. * * After a crash, this starts with the next unflushed transaction, and * replays until it finds one too old, or invalid. * * On exit, it sets things up so the first transaction will work correctly. * NOTE: only called during fs mount */ static int journal_read(struct super_block *sb) { struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_journal_desc *desc; unsigned int oldest_trans_id = 0; unsigned int oldest_invalid_trans_id = 0; time64_t start; unsigned long oldest_start = 0; unsigned long cur_dblock = 0; unsigned long newest_mount_id = 9; struct buffer_head *d_bh; struct reiserfs_journal_header *jh; int valid_journal_header = 0; int replay_count = 0; int continue_replay = 1; int ret; cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb); reiserfs_info(sb, "checking transaction log (%pg)\n", file_bdev(journal->j_bdev_file)); start = ktime_get_seconds(); /* * step 1, read in the journal header block. Check the transaction * it says is the first unflushed, and if that transaction is not * valid, replay is done */ journal->j_header_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + SB_ONDISK_JOURNAL_SIZE(sb)); if (!journal->j_header_bh) { return 1; } jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data); if (le32_to_cpu(jh->j_first_unflushed_offset) < SB_ONDISK_JOURNAL_SIZE(sb) && le32_to_cpu(jh->j_last_flush_trans_id) > 0) { oldest_start = SB_ONDISK_JOURNAL_1st_BLOCK(sb) + le32_to_cpu(jh->j_first_unflushed_offset); oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; newest_mount_id = le32_to_cpu(jh->j_mount_id); reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1153: found in " "header: first_unflushed_offset %d, last_flushed_trans_id " "%lu", le32_to_cpu(jh->j_first_unflushed_offset), le32_to_cpu(jh->j_last_flush_trans_id)); valid_journal_header = 1; /* * now, we try to read the first unflushed offset. If it * is not valid, there is nothing more we can do, and it * makes no sense to read through the whole log. */ d_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + le32_to_cpu(jh->j_first_unflushed_offset)); ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL); if (!ret) { continue_replay = 0; } brelse(d_bh); goto start_log_replay; } /* * ok, there are transactions that need to be replayed. start * with the first log block, find all the valid transactions, and * pick out the oldest. */ while (continue_replay && cur_dblock < (SB_ONDISK_JOURNAL_1st_BLOCK(sb) + SB_ONDISK_JOURNAL_SIZE(sb))) { /* * Note that it is required for blocksize of primary fs * device and journal device to be the same */ d_bh = reiserfs_breada(file_bdev(journal->j_bdev_file), cur_dblock, sb->s_blocksize, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + SB_ONDISK_JOURNAL_SIZE(sb)); ret = journal_transaction_is_valid(sb, d_bh, &oldest_invalid_trans_id, &newest_mount_id); if (ret == 1) { desc = (struct reiserfs_journal_desc *)d_bh->b_data; if (oldest_start == 0) { /* init all oldest_ values */ oldest_trans_id = get_desc_trans_id(desc); oldest_start = d_bh->b_blocknr; newest_mount_id = get_desc_mount_id(desc); reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1179: Setting " "oldest_start to offset %llu, trans_id %lu", oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK (sb), oldest_trans_id); } else if (oldest_trans_id > get_desc_trans_id(desc)) { /* one we just read was older */ oldest_trans_id = get_desc_trans_id(desc); oldest_start = d_bh->b_blocknr; reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1180: Resetting " "oldest_start to offset %lu, trans_id %lu", oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK (sb), oldest_trans_id); } if (newest_mount_id < get_desc_mount_id(desc)) { newest_mount_id = get_desc_mount_id(desc); reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " "newest_mount_id to %d", get_desc_mount_id(desc)); } cur_dblock += get_desc_trans_len(desc) + 2; } else { cur_dblock++; } brelse(d_bh); } start_log_replay: cur_dblock = oldest_start; if (oldest_trans_id) { reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1206: Starting replay " "from offset %llu, trans_id %lu", cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb), oldest_trans_id); } replay_count = 0; while (continue_replay && oldest_trans_id > 0) { ret = journal_read_transaction(sb, cur_dblock, oldest_start, oldest_trans_id, newest_mount_id); if (ret < 0) { return ret; } else if (ret != 0) { break; } cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start; replay_count++; if (cur_dblock == oldest_start) break; } if (oldest_trans_id == 0) { reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1225: No valid " "transactions found"); } /* * j_start does not get set correctly if we don't replay any * transactions. if we had a valid journal_header, set j_start * to the first unflushed transaction value, copy the trans_id * from the header */ if (valid_journal_header && replay_count == 0) { journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset); journal->j_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; /* check for trans_id overflow */ if (journal->j_trans_id == 0) journal->j_trans_id = 10; journal->j_last_flush_trans_id = le32_to_cpu(jh->j_last_flush_trans_id); journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1; } else { journal->j_mount_id = newest_mount_id + 1; } reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " "newest_mount_id to %lu", journal->j_mount_id); journal->j_first_unflushed_offset = journal->j_start; if (replay_count > 0) { reiserfs_info(sb, "replayed %d transactions in %lu seconds\n", replay_count, ktime_get_seconds() - start); } /* needed to satisfy the locking in _update_journal_header_block */ reiserfs_write_lock(sb); if (!bdev_read_only(sb->s_bdev) && _update_journal_header_block(sb, journal->j_start, journal->j_last_flush_trans_id)) { reiserfs_write_unlock(sb); /* * replay failed, caller must call free_journal_ram and abort * the mount */ return -1; } reiserfs_write_unlock(sb); return 0; } static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) { struct reiserfs_journal_list *jl; jl = kzalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS | __GFP_NOFAIL); INIT_LIST_HEAD(&jl->j_list); INIT_LIST_HEAD(&jl->j_working_list); INIT_LIST_HEAD(&jl->j_tail_bh_list); INIT_LIST_HEAD(&jl->j_bh_list); mutex_init(&jl->j_commit_mutex); SB_JOURNAL(s)->j_num_lists++; get_journal_list(jl); return jl; } static void journal_list_init(struct super_block *sb) { SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb); } static void release_journal_dev(struct reiserfs_journal *journal) { if (journal->j_bdev_file) { bdev_fput(journal->j_bdev_file); journal->j_bdev_file = NULL; } } static int journal_init_dev(struct super_block *super, struct reiserfs_journal *journal, const char *jdev_name) { blk_mode_t blkdev_mode = BLK_OPEN_READ; void *holder = journal; int result; dev_t jdev; result = 0; journal->j_bdev_file = NULL; jdev = SB_ONDISK_JOURNAL_DEVICE(super) ? new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; if (!bdev_read_only(super->s_bdev)) blkdev_mode |= BLK_OPEN_WRITE; /* there is no "jdev" option and journal is on separate device */ if ((!jdev_name || !jdev_name[0])) { if (jdev == super->s_dev) holder = NULL; journal->j_bdev_file = bdev_file_open_by_dev(jdev, blkdev_mode, holder, NULL); if (IS_ERR(journal->j_bdev_file)) { result = PTR_ERR(journal->j_bdev_file); journal->j_bdev_file = NULL; reiserfs_warning(super, "sh-458", "cannot init journal device unknown-block(%u,%u): %i", MAJOR(jdev), MINOR(jdev), result); return result; } else if (jdev != super->s_dev) set_blocksize(file_bdev(journal->j_bdev_file), super->s_blocksize); return 0; } journal->j_bdev_file = bdev_file_open_by_path(jdev_name, blkdev_mode, holder, NULL); if (IS_ERR(journal->j_bdev_file)) { result = PTR_ERR(journal->j_bdev_file); journal->j_bdev_file = NULL; reiserfs_warning(super, "sh-457", "journal_init_dev: Cannot open '%s': %i", jdev_name, result); return result; } set_blocksize(file_bdev(journal->j_bdev_file), super->s_blocksize); reiserfs_info(super, "journal_init_dev: journal device: %pg\n", file_bdev(journal->j_bdev_file)); return 0; } /* * When creating/tuning a file system user can assign some * journal params within boundaries which depend on the ratio * blocksize/standard_blocksize. * * For blocks >= standard_blocksize transaction size should * be not less then JOURNAL_TRANS_MIN_DEFAULT, and not more * then JOURNAL_TRANS_MAX_DEFAULT. * * For blocks < standard_blocksize these boundaries should be * decreased proportionally. */ #define REISERFS_STANDARD_BLKSIZE (4096) static int check_advise_trans_params(struct super_block *sb, struct reiserfs_journal *journal) { if (journal->j_trans_max) { /* Non-default journal params. Do sanity check for them. */ int ratio = 1; if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE) ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize; if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio || journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio || SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max < JOURNAL_MIN_RATIO) { reiserfs_warning(sb, "sh-462", "bad transaction max size (%u). " "FSCK?", journal->j_trans_max); return 1; } if (journal->j_max_batch != (journal->j_trans_max) * JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) { reiserfs_warning(sb, "sh-463", "bad transaction max batch (%u). " "FSCK?", journal->j_max_batch); return 1; } } else { /* * Default journal params. * The file system was created by old version * of mkreiserfs, so some fields contain zeros, * and we need to advise proper values for them */ if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) { reiserfs_warning(sb, "sh-464", "bad blocksize (%u)", sb->s_blocksize); return 1; } journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT; journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT; journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE; } return 0; } /* must be called once on fs mount. calls journal_read for you */ int journal_init(struct super_block *sb, const char *j_dev_name, int old_format, unsigned int commit_max_age) { int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2; struct buffer_head *bhjh; struct reiserfs_super_block *rs; struct reiserfs_journal_header *jh; struct reiserfs_journal *journal; struct reiserfs_journal_list *jl; int ret; journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal)); if (!journal) { reiserfs_warning(sb, "journal-1256", "unable to get memory for journal structure"); return 1; } INIT_LIST_HEAD(&journal->j_bitmap_nodes); INIT_LIST_HEAD(&journal->j_prealloc_list); INIT_LIST_HEAD(&journal->j_working_list); INIT_LIST_HEAD(&journal->j_journal_list); journal->j_persistent_trans = 0; if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, reiserfs_bmap_count(sb))) goto free_and_return; allocate_bitmap_nodes(sb); /* reserved for journal area support */ SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ? REISERFS_OLD_DISK_OFFSET_IN_BYTES / sb->s_blocksize + reiserfs_bmap_count(sb) + 1 : REISERFS_DISK_OFFSET_IN_BYTES / sb->s_blocksize + 2); /* * Sanity check to see is the standard journal fitting * within first bitmap (actual for small blocksizes) */ if (!SB_ONDISK_JOURNAL_DEVICE(sb) && (SB_JOURNAL_1st_RESERVED_BLOCK(sb) + SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) { reiserfs_warning(sb, "journal-1393", "journal does not fit for area addressed " "by first of bitmap blocks. It starts at " "%u and its size is %u. Block size %ld", SB_JOURNAL_1st_RESERVED_BLOCK(sb), SB_ONDISK_JOURNAL_SIZE(sb), sb->s_blocksize); goto free_and_return; } /* * Sanity check to see if journal first block is correct. * If journal first block is invalid it can cause * zeroing important superblock members. */ if (!SB_ONDISK_JOURNAL_DEVICE(sb) && SB_ONDISK_JOURNAL_1st_BLOCK(sb) < SB_JOURNAL_1st_RESERVED_BLOCK(sb)) { reiserfs_warning(sb, "journal-1393", "journal 1st super block is invalid: 1st reserved block %d, but actual 1st block is %d", SB_JOURNAL_1st_RESERVED_BLOCK(sb), SB_ONDISK_JOURNAL_1st_BLOCK(sb)); goto free_and_return; } if (journal_init_dev(sb, journal, j_dev_name) != 0) { reiserfs_warning(sb, "sh-462", "unable to initialize journal device"); goto free_and_return; } rs = SB_DISK_SUPER_BLOCK(sb); /* read journal header */ bhjh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + SB_ONDISK_JOURNAL_SIZE(sb)); if (!bhjh) { reiserfs_warning(sb, "sh-459", "unable to read journal header"); goto free_and_return; } jh = (struct reiserfs_journal_header *)(bhjh->b_data); /* make sure that journal matches to the super block */ if (is_reiserfs_jr(rs) && (le32_to_cpu(jh->jh_journal.jp_journal_magic) != sb_jp_journal_magic(rs))) { reiserfs_warning(sb, "sh-460", "journal header magic %x (device %pg) does " "not match to magic found in super block %x", jh->jh_journal.jp_journal_magic, file_bdev(journal->j_bdev_file), sb_jp_journal_magic(rs)); brelse(bhjh); goto free_and_return; } journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max); journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch); journal->j_max_commit_age = le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age); journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; if (check_advise_trans_params(sb, journal) != 0) goto free_and_return; journal->j_default_max_commit_age = journal->j_max_commit_age; if (commit_max_age != 0) { journal->j_max_commit_age = commit_max_age; journal->j_max_trans_age = commit_max_age; } reiserfs_info(sb, "journal params: device %pg, size %u, " "journal first block %u, max trans len %u, max batch %u, " "max commit age %u, max trans age %u\n", file_bdev(journal->j_bdev_file), SB_ONDISK_JOURNAL_SIZE(sb), SB_ONDISK_JOURNAL_1st_BLOCK(sb), journal->j_trans_max, journal->j_max_batch, journal->j_max_commit_age, journal->j_max_trans_age); brelse(bhjh); journal->j_list_bitmap_index = 0; journal_list_init(sb); memset(journal->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); INIT_LIST_HEAD(&journal->j_dirty_buffers); spin_lock_init(&journal->j_dirty_buffers_lock); journal->j_start = 0; journal->j_len = 0; journal->j_len_alloc = 0; atomic_set(&journal->j_wcount, 0); atomic_set(&journal->j_async_throttle, 0); journal->j_bcount = 0; journal->j_trans_start_time = 0; journal->j_last = NULL; journal->j_first = NULL; init_waitqueue_head(&journal->j_join_wait); mutex_init(&journal->j_mutex); mutex_init(&journal->j_flush_mutex); journal->j_trans_id = 10; journal->j_mount_id = 10; journal->j_state = 0; atomic_set(&journal->j_jlock, 0); journal->j_cnode_free_list = allocate_cnodes(num_cnodes); journal->j_cnode_free_orig = journal->j_cnode_free_list; journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; journal->j_cnode_used = 0; journal->j_must_wait = 0; if (journal->j_cnode_free == 0) { reiserfs_warning(sb, "journal-2004", "Journal cnode memory " "allocation failed (%ld bytes). Journal is " "too large for available memory. Usually " "this is due to a journal that is too large.", sizeof (struct reiserfs_journal_cnode) * num_cnodes); goto free_and_return; } init_journal_hash(sb); jl = journal->j_current_jl; /* * get_list_bitmap() may call flush_commit_list() which * requires the lock. Calling flush_commit_list() shouldn't happen * this early but I like to be paranoid. */ reiserfs_write_lock(sb); jl->j_list_bitmap = get_list_bitmap(sb, jl); reiserfs_write_unlock(sb); if (!jl->j_list_bitmap) { reiserfs_warning(sb, "journal-2005", "get_list_bitmap failed for journal list 0"); goto free_and_return; } ret = journal_read(sb); if (ret < 0) { reiserfs_warning(sb, "reiserfs-2006", "Replay Failure, unable to mount"); goto free_and_return; } INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); journal->j_work_sb = sb; return 0; free_and_return: free_journal_ram(sb); return 1; } /* * test for a polite end of the current transaction. Used by file_write, * and should be used by delete to make sure they don't write more than * can fit inside a single transaction */ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) { struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); time64_t now = ktime_get_seconds(); /* cannot restart while nested */ BUG_ON(!th->t_trans_id); if (th->t_refcount > 1) return 0; if (journal->j_must_wait > 0 || (journal->j_len_alloc + new_alloc) >= journal->j_max_batch || atomic_read(&journal->j_jlock) || (now - journal->j_trans_start_time) > journal->j_max_trans_age || journal->j_cnode_free < (journal->j_trans_max * 3)) { return 1; } journal->j_len_alloc += new_alloc; th->t_blocks_allocated += new_alloc ; return 0; } /* this must be called inside a transaction */ void reiserfs_block_writes(struct reiserfs_transaction_handle *th) { struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); BUG_ON(!th->t_trans_id); journal->j_must_wait = 1; set_bit(J_WRITERS_BLOCKED, &journal->j_state); return; } /* this must be called without a transaction started */ void reiserfs_allow_writes(struct super_block *s) { struct reiserfs_journal *journal = SB_JOURNAL(s); clear_bit(J_WRITERS_BLOCKED, &journal->j_state); wake_up(&journal->j_join_wait); } /* this must be called without a transaction started */ void reiserfs_wait_on_write_block(struct super_block *s) { struct reiserfs_journal *journal = SB_JOURNAL(s); wait_event(journal->j_join_wait, !test_bit(J_WRITERS_BLOCKED, &journal->j_state)); } static void queue_log_writer(struct super_block *s) { wait_queue_entry_t wait; struct reiserfs_journal *journal = SB_JOURNAL(s); set_bit(J_WRITERS_QUEUED, &journal->j_state); /* * we don't want to use wait_event here because * we only want to wait once. */ init_waitqueue_entry(&wait, current); add_wait_queue(&journal->j_join_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) { int depth = reiserfs_write_unlock_nested(s); schedule(); reiserfs_write_lock_nested(s, depth); } __set_current_state(TASK_RUNNING); remove_wait_queue(&journal->j_join_wait, &wait); } static void wake_queued_writers(struct super_block *s) { struct reiserfs_journal *journal = SB_JOURNAL(s); if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state)) wake_up(&journal->j_join_wait); } static void let_transaction_grow(struct super_block *sb, unsigned int trans_id) { struct reiserfs_journal *journal = SB_JOURNAL(sb); unsigned long bcount = journal->j_bcount; while (1) { int depth; depth = reiserfs_write_unlock_nested(sb); schedule_timeout_uninterruptible(1); reiserfs_write_lock_nested(sb, depth); journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; while ((atomic_read(&journal->j_wcount) > 0 || atomic_read(&journal->j_jlock)) && journal->j_trans_id == trans_id) { queue_log_writer(sb); } if (journal->j_trans_id != trans_id) break; if (bcount == journal->j_bcount) break; bcount = journal->j_bcount; } } /* * join == true if you must join an existing transaction. * join == false if you can deal with waiting for others to finish * * this will block until the transaction is joinable. send the number of * blocks you expect to use in nblocks. */ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block *sb, unsigned long nblocks, int join) { time64_t now = ktime_get_seconds(); unsigned int old_trans_id; struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_transaction_handle myth; int retval; int depth; reiserfs_check_lock_depth(sb, "journal_begin"); BUG_ON(nblocks > journal->j_trans_max); PROC_INFO_INC(sb, journal.journal_being); /* set here for journal_join */ th->t_refcount = 1; th->t_super = sb; relock: lock_journal(sb); if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) { unlock_journal(sb); retval = journal->j_errno; goto out_fail; } journal->j_bcount++; if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { unlock_journal(sb); depth = reiserfs_write_unlock_nested(sb); reiserfs_wait_on_write_block(sb); reiserfs_write_lock_nested(sb, depth); PROC_INFO_INC(sb, journal.journal_relock_writers); goto relock; } now = ktime_get_seconds(); /* * if there is no room in the journal OR * if this transaction is too old, and we weren't called joinable, * wait for it to finish before beginning we don't sleep if there * aren't other writers */ if ((!join && journal->j_must_wait > 0) || (!join && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch) || (!join && atomic_read(&journal->j_wcount) > 0 && journal->j_trans_start_time > 0 && (now - journal->j_trans_start_time) > journal->j_max_trans_age) || (!join && atomic_read(&journal->j_jlock)) || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) { old_trans_id = journal->j_trans_id; /* allow others to finish this transaction */ unlock_journal(sb); if (!join && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch && ((journal->j_len + nblocks + 2) * 100) < (journal->j_len_alloc * 75)) { if (atomic_read(&journal->j_wcount) > 10) { queue_log_writer(sb); goto relock; } } /* * don't mess with joining the transaction if all we * have to do is wait for someone else to do a commit */ if (atomic_read(&journal->j_jlock)) { while (journal->j_trans_id == old_trans_id && atomic_read(&journal->j_jlock)) { queue_log_writer(sb); } goto relock; } retval = journal_join(&myth, sb); if (retval) goto out_fail; /* someone might have ended the transaction while we joined */ if (old_trans_id != journal->j_trans_id) { retval = do_journal_end(&myth, 0); } else { retval = do_journal_end(&myth, COMMIT_NOW); } if (retval) goto out_fail; PROC_INFO_INC(sb, journal.journal_relock_wcount); goto relock; } /* we are the first writer, set trans_id */ if (journal->j_trans_start_time == 0) { journal->j_trans_start_time = ktime_get_seconds(); } atomic_inc(&journal->j_wcount); journal->j_len_alloc += nblocks; th->t_blocks_logged = 0; th->t_blocks_allocated = nblocks; th->t_trans_id = journal->j_trans_id; unlock_journal(sb); INIT_LIST_HEAD(&th->t_list); return 0; out_fail: memset(th, 0, sizeof(*th)); /* * Re-set th->t_super, so we can properly keep track of how many * persistent transactions there are. We need to do this so if this * call is part of a failed restart_transaction, we can free it later */ th->t_super = sb; return retval; } struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct super_block *s, int nblocks) { int ret; struct reiserfs_transaction_handle *th; /* * if we're nesting into an existing transaction. It will be * persistent on its own */ if (reiserfs_transaction_running(s)) { th = current->journal_info; th->t_refcount++; BUG_ON(th->t_refcount < 2); return th; } th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS); if (!th) return NULL; ret = journal_begin(th, s, nblocks); if (ret) { kfree(th); return NULL; } SB_JOURNAL(s)->j_persistent_trans++; return th; } int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) { struct super_block *s = th->t_super; int ret = 0; if (th->t_trans_id) ret = journal_end(th); else ret = -EIO; if (th->t_refcount == 0) { SB_JOURNAL(s)->j_persistent_trans--; kfree(th); } return ret; } static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *sb) { struct reiserfs_transaction_handle *cur_th = current->journal_info; /* * this keeps do_journal_end from NULLing out the * current->journal_info pointer */ th->t_handle_save = cur_th; BUG_ON(cur_th && cur_th->t_refcount > 1); return do_journal_begin_r(th, sb, 1, JBEGIN_JOIN); } int journal_join_abort(struct reiserfs_transaction_handle *th, struct super_block *sb) { struct reiserfs_transaction_handle *cur_th = current->journal_info; /* * this keeps do_journal_end from NULLing out the * current->journal_info pointer */ th->t_handle_save = cur_th; BUG_ON(cur_th && cur_th->t_refcount > 1); return do_journal_begin_r(th, sb, 1, JBEGIN_ABORT); } int journal_begin(struct reiserfs_transaction_handle *th, struct super_block *sb, unsigned long nblocks) { struct reiserfs_transaction_handle *cur_th = current->journal_info; int ret; th->t_handle_save = NULL; if (cur_th) { /* we are nesting into the current transaction */ if (cur_th->t_super == sb) { BUG_ON(!cur_th->t_refcount); cur_th->t_refcount++; memcpy(th, cur_th, sizeof(*th)); if (th->t_refcount <= 1) reiserfs_warning(sb, "reiserfs-2005", "BAD: refcount <= 1, but " "journal_info != 0"); return 0; } else { /* * we've ended up with a handle from a different * filesystem. save it and restore on journal_end. * This should never really happen... */ reiserfs_warning(sb, "clm-2100", "nesting info a different FS"); th->t_handle_save = current->journal_info; current->journal_info = th; } } else { current->journal_info = th; } ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG); BUG_ON(current->journal_info != th); /* * I guess this boils down to being the reciprocal of clm-2100 above. * If do_journal_begin_r fails, we need to put it back, since * journal_end won't be called to do it. */ if (ret) current->journal_info = th->t_handle_save; else BUG_ON(!th->t_refcount); return ret; } /* * puts bh into the current transaction. If it was already there, reorders * removes the old pointers from the hash, and puts new ones in (to make * sure replay happen in the right order). * * if it was dirty, cleans and files onto the clean list. I can't let it * be dirty again until the transaction is committed. * * if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len. */ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct buffer_head *bh) { struct super_block *sb = th->t_super; struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_journal_cnode *cn = NULL; int count_already_incd = 0; int prepared = 0; BUG_ON(!th->t_trans_id); PROC_INFO_INC(sb, journal.mark_dirty); if (th->t_trans_id != journal->j_trans_id) { reiserfs_panic(th->t_super, "journal-1577", "handle trans id %ld != current trans id %ld", th->t_trans_id, journal->j_trans_id); } prepared = test_clear_buffer_journal_prepared(bh); clear_buffer_journal_restore_dirty(bh); /* already in this transaction, we are done */ if (buffer_journaled(bh)) { PROC_INFO_INC(sb, journal.mark_dirty_already); return 0; } /* * this must be turned into a panic instead of a warning. We can't * allow a dirty or journal_dirty or locked buffer to be logged, as * some changes could get to disk too early. NOT GOOD. */ if (!prepared || buffer_dirty(bh)) { reiserfs_warning(sb, "journal-1777", "buffer %llu bad state " "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT", (unsigned long long)bh->b_blocknr, prepared ? ' ' : '!', buffer_locked(bh) ? ' ' : '!', buffer_dirty(bh) ? ' ' : '!', buffer_journal_dirty(bh) ? ' ' : '!'); } if (atomic_read(&journal->j_wcount) <= 0) { reiserfs_warning(sb, "journal-1409", "returning because j_wcount was %d", atomic_read(&journal->j_wcount)); return 1; } /* * this error means I've screwed up, and we've overflowed * the transaction. Nothing can be done here, except make the * FS readonly or panic. */ if (journal->j_len >= journal->j_trans_max) { reiserfs_panic(th->t_super, "journal-1413", "j_len (%lu) is too big", journal->j_len); } if (buffer_journal_dirty(bh)) { count_already_incd = 1; PROC_INFO_INC(sb, journal.mark_dirty_notjournal); clear_buffer_journal_dirty(bh); } if (journal->j_len > journal->j_len_alloc) { journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT; } set_buffer_journaled(bh); /* now put this guy on the end */ if (!cn) { cn = get_cnode(sb); if (!cn) { reiserfs_panic(sb, "journal-4", "get_cnode failed!"); } if (th->t_blocks_logged == th->t_blocks_allocated) { th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT; journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT; } th->t_blocks_logged++; journal->j_len++; cn->bh = bh; cn->blocknr = bh->b_blocknr; cn->sb = sb; cn->jlist = NULL; insert_journal_hash(journal->j_hash_table, cn); if (!count_already_incd) { get_bh(bh); } } cn->next = NULL; cn->prev = journal->j_last; cn->bh = bh; if (journal->j_last) { journal->j_last->next = cn; journal->j_last = cn; } else { journal->j_first = cn; journal->j_last = cn; } reiserfs_schedule_old_flush(sb); return 0; } int journal_end(struct reiserfs_transaction_handle *th) { struct super_block *sb = th->t_super; if (!current->journal_info && th->t_refcount > 1) reiserfs_warning(sb, "REISER-NESTING", "th NULL, refcount %d", th->t_refcount); if (!th->t_trans_id) { WARN_ON(1); return -EIO; } th->t_refcount--; if (th->t_refcount > 0) { struct reiserfs_transaction_handle *cur_th = current->journal_info; /* * we aren't allowed to close a nested transaction on a * different filesystem from the one in the task struct */ BUG_ON(cur_th->t_super != th->t_super); if (th != cur_th) { memcpy(current->journal_info, th, sizeof(*th)); th->t_trans_id = 0; } return 0; } else { return do_journal_end(th, 0); } } /* * removes from the current transaction, relsing and descrementing any counters. * also files the removed buffer directly onto the clean list * * called by journal_mark_freed when a block has been deleted * * returns 1 if it cleaned and relsed the buffer. 0 otherwise */ static int remove_from_transaction(struct super_block *sb, b_blocknr_t blocknr, int already_cleaned) { struct buffer_head *bh; struct reiserfs_journal_cnode *cn; struct reiserfs_journal *journal = SB_JOURNAL(sb); int ret = 0; cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr); if (!cn || !cn->bh) { return ret; } bh = cn->bh; if (cn->prev) { cn->prev->next = cn->next; } if (cn->next) { cn->next->prev = cn->prev; } if (cn == journal->j_first) { journal->j_first = cn->next; } if (cn == journal->j_last) { journal->j_last = cn->prev; } remove_journal_hash(sb, journal->j_hash_table, NULL, bh->b_blocknr, 0); clear_buffer_journaled(bh); /* don't log this one */ if (!already_cleaned) { clear_buffer_journal_dirty(bh); clear_buffer_dirty(bh); clear_buffer_journal_test(bh); put_bh(bh); if (atomic_read(&bh->b_count) < 0) { reiserfs_warning(sb, "journal-1752", "b_count < 0"); } ret = 1; } journal->j_len--; journal->j_len_alloc--; free_cnode(sb, cn); return ret; } /* * for any cnode in a journal list, it can only be dirtied of all the * transactions that include it are committed to disk. * this checks through each transaction, and returns 1 if you are allowed * to dirty, and 0 if you aren't * * it is called by dirty_journal_list, which is called after * flush_commit_list has gotten all the log blocks for a given * transaction on disk * */ static int can_dirty(struct reiserfs_journal_cnode *cn) { struct super_block *sb = cn->sb; b_blocknr_t blocknr = cn->blocknr; struct reiserfs_journal_cnode *cur = cn->hprev; int can_dirty = 1; /* * first test hprev. These are all newer than cn, so any node here * with the same block number and dev means this node can't be sent * to disk right now. */ while (cur && can_dirty) { if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) { can_dirty = 0; } cur = cur->hprev; } /* * then test hnext. These are all older than cn. As long as they * are committed to the log, it is safe to write cn to disk */ cur = cn->hnext; while (cur && can_dirty) { if (cur->jlist && cur->jlist->j_len > 0 && atomic_read(&cur->jlist->j_commit_left) > 0 && cur->bh && cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) { can_dirty = 0; } cur = cur->hnext; } return can_dirty; } /* * syncs the commit blocks, but does not force the real buffers to disk * will wait until the current transaction is done/committed before returning */ int journal_end_sync(struct reiserfs_transaction_handle *th) { struct super_block *sb = th->t_super; struct reiserfs_journal *journal = SB_JOURNAL(sb); BUG_ON(!th->t_trans_id); /* you can sync while nested, very, very bad */ BUG_ON(th->t_refcount > 1); if (journal->j_len == 0) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb)); } return do_journal_end(th, COMMIT_NOW | WAIT); } /* writeback the pending async commits to disk */ static void flush_async_commits(struct work_struct *work) { struct reiserfs_journal *journal = container_of(work, struct reiserfs_journal, j_work.work); struct super_block *sb = journal->j_work_sb; struct reiserfs_journal_list *jl; struct list_head *entry; reiserfs_write_lock(sb); if (!list_empty(&journal->j_journal_list)) { /* last entry is the youngest, commit it and you get everything */ entry = journal->j_journal_list.prev; jl = JOURNAL_LIST_ENTRY(entry); flush_commit_list(sb, jl, 1); } reiserfs_write_unlock(sb); } /* * flushes any old transactions to disk * ends the current transaction if it is too old */ void reiserfs_flush_old_commits(struct super_block *sb) { time64_t now; struct reiserfs_transaction_handle th; struct reiserfs_journal *journal = SB_JOURNAL(sb); now = ktime_get_seconds(); /* * safety check so we don't flush while we are replaying the log during * mount */ if (list_empty(&journal->j_journal_list)) return; /* * check the current transaction. If there are no writers, and it is * too old, finish it, and force the commit blocks to disk */ if (atomic_read(&journal->j_wcount) <= 0 && journal->j_trans_start_time > 0 && journal->j_len > 0 && (now - journal->j_trans_start_time) > journal->j_max_trans_age) { if (!journal_join(&th, sb)) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb)); /* * we're only being called from kreiserfsd, it makes * no sense to do an async commit so that kreiserfsd * can do it later */ do_journal_end(&th, COMMIT_NOW | WAIT); } } } /* * returns 0 if do_journal_end should return right away, returns 1 if * do_journal_end should finish the commit * * if the current transaction is too old, but still has writers, this will * wait on j_join_wait until all the writers are done. By the time it * wakes up, the transaction it was called has already ended, so it just * flushes the commit list and returns 0. * * Won't batch when flush or commit_now is set. Also won't batch when * others are waiting on j_join_wait. * * Note, we can't allow the journal_end to proceed while there are still * writers in the log. */ static int check_journal_end(struct reiserfs_transaction_handle *th, int flags) { time64_t now; int flush = flags & FLUSH_ALL; int commit_now = flags & COMMIT_NOW; int wait_on_commit = flags & WAIT; struct reiserfs_journal_list *jl; struct super_block *sb = th->t_super; struct reiserfs_journal *journal = SB_JOURNAL(sb); BUG_ON(!th->t_trans_id); if (th->t_trans_id != journal->j_trans_id) { reiserfs_panic(th->t_super, "journal-1577", "handle trans id %ld != current trans id %ld", th->t_trans_id, journal->j_trans_id); } journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged); /* <= 0 is allowed. unmounting might not call begin */ if (atomic_read(&journal->j_wcount) > 0) atomic_dec(&journal->j_wcount); /* * BUG, deal with case where j_len is 0, but people previously * freed blocks need to be released will be dealt with by next * transaction that actually writes something, but should be taken * care of in this trans */ BUG_ON(journal->j_len == 0); /* * if wcount > 0, and we are called to with flush or commit_now, * we wait on j_join_wait. We will wake up when the last writer has * finished the transaction, and started it on its way to the disk. * Then, we flush the commit or journal list, and just return 0 * because the rest of journal end was already done for this * transaction. */ if (atomic_read(&journal->j_wcount) > 0) { if (flush || commit_now) { unsigned trans_id; jl = journal->j_current_jl; trans_id = jl->j_trans_id; if (wait_on_commit) jl->j_state |= LIST_COMMIT_PENDING; atomic_set(&journal->j_jlock, 1); if (flush) { journal->j_next_full_flush = 1; } unlock_journal(sb); /* * sleep while the current transaction is * still j_jlocked */ while (journal->j_trans_id == trans_id) { if (atomic_read(&journal->j_jlock)) { queue_log_writer(sb); } else { lock_journal(sb); if (journal->j_trans_id == trans_id) { atomic_set(&journal->j_jlock, 1); } unlock_journal(sb); } } BUG_ON(journal->j_trans_id == trans_id); if (commit_now && journal_list_still_alive(sb, trans_id) && wait_on_commit) { flush_commit_list(sb, jl, 1); } return 0; } unlock_journal(sb); return 0; } /* deal with old transactions where we are the last writers */ now = ktime_get_seconds(); if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) { commit_now = 1; journal->j_next_async_flush = 1; } /* don't batch when someone is waiting on j_join_wait */ /* don't batch when syncing the commit or flushing the whole trans */ if (!(journal->j_must_wait > 0) && !(atomic_read(&journal->j_jlock)) && !flush && !commit_now && (journal->j_len < journal->j_max_batch) && journal->j_len_alloc < journal->j_max_batch && journal->j_cnode_free > (journal->j_trans_max * 3)) { journal->j_bcount++; unlock_journal(sb); return 0; } if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) { reiserfs_panic(sb, "journal-003", "j_start (%ld) is too high", journal->j_start); } return 1; } /* * Does all the work that makes deleting blocks safe. * when deleting a block mark BH_JNew, just remove it from the current * transaction, clean it's buffer_head and move on. * * otherwise: * set a bit for the block in the journal bitmap. That will prevent it from * being allocated for unformatted nodes before this transaction has finished. * * mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers. * That will prevent any old transactions with this block from trying to flush * to the real location. Since we aren't removing the cnode from the * journal_list_hash, *the block can't be reallocated yet. * * Then remove it from the current transaction, decrementing any counters and * filing it on the clean list. */ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_block *sb, b_blocknr_t blocknr) { struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_journal_cnode *cn = NULL; struct buffer_head *bh = NULL; struct reiserfs_list_bitmap *jb = NULL; int cleaned = 0; BUG_ON(!th->t_trans_id); cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr); if (cn && cn->bh) { bh = cn->bh; get_bh(bh); } /* if it is journal new, we just remove it from this transaction */ if (bh && buffer_journal_new(bh)) { clear_buffer_journal_new(bh); clear_prepared_bits(bh); reiserfs_clean_and_file_buffer(bh); cleaned = remove_from_transaction(sb, blocknr, cleaned); } else { /* * set the bit for this block in the journal bitmap * for this transaction */ jb = journal->j_current_jl->j_list_bitmap; if (!jb) { reiserfs_panic(sb, "journal-1702", "journal_list_bitmap is NULL"); } set_bit_in_list_bitmap(sb, blocknr, jb); /* Note, the entire while loop is not allowed to schedule. */ if (bh) { clear_prepared_bits(bh); reiserfs_clean_and_file_buffer(bh); } cleaned = remove_from_transaction(sb, blocknr, cleaned); /* * find all older transactions with this block, * make sure they don't try to write it out */ cn = get_journal_hash_dev(sb, journal->j_list_hash_table, blocknr); while (cn) { if (sb == cn->sb && blocknr == cn->blocknr) { set_bit(BLOCK_FREED, &cn->state); if (cn->bh) { /* * remove_from_transaction will brelse * the buffer if it was in the current * trans */ if (!cleaned) { clear_buffer_journal_dirty(cn-> bh); clear_buffer_dirty(cn->bh); clear_buffer_journal_test(cn-> bh); cleaned = 1; put_bh(cn->bh); if (atomic_read (&cn->bh->b_count) < 0) { reiserfs_warning(sb, "journal-2138", "cn->bh->b_count < 0"); } } /* * since we are clearing the bh, * we MUST dec nonzerolen */ if (cn->jlist) { atomic_dec(&cn->jlist-> j_nonzerolen); } cn->bh = NULL; } } cn = cn->hnext; } } if (bh) release_buffer_page(bh); /* get_hash grabs the buffer */ return 0; } void reiserfs_update_inode_transaction(struct inode *inode) { struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb); REISERFS_I(inode)->i_jl = journal->j_current_jl; REISERFS_I(inode)->i_trans_id = journal->j_trans_id; } /* * returns -1 on error, 0 if no commits/barriers were done and 1 * if a transaction was actually committed and the barrier was done */ static int __commit_trans_jl(struct inode *inode, unsigned long id, struct reiserfs_journal_list *jl) { struct reiserfs_transaction_handle th; struct super_block *sb = inode->i_sb; struct reiserfs_journal *journal = SB_JOURNAL(sb); int ret = 0; /* * is it from the current transaction, * or from an unknown transaction? */ if (id == journal->j_trans_id) { jl = journal->j_current_jl; /* * try to let other writers come in and * grow this transaction */ let_transaction_grow(sb, id); if (journal->j_trans_id != id) { goto flush_commit_only; } ret = journal_begin(&th, sb, 1); if (ret) return ret; /* someone might have ended this transaction while we joined */ if (journal->j_trans_id != id) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb)); ret = journal_end(&th); goto flush_commit_only; } ret = journal_end_sync(&th); if (!ret) ret = 1; } else { /* * this gets tricky, we have to make sure the journal list in * the inode still exists. We know the list is still around * if we've got a larger transaction id than the oldest list */ flush_commit_only: if (journal_list_still_alive(inode->i_sb, id)) { /* * we only set ret to 1 when we know for sure * the barrier hasn't been started yet on the commit * block. */ if (atomic_read(&jl->j_commit_left) > 1) ret = 1; flush_commit_list(sb, jl, 1); if (journal->j_errno) ret = journal->j_errno; } } /* otherwise the list is gone, and long since committed */ return ret; } int reiserfs_commit_for_inode(struct inode *inode) { unsigned int id = REISERFS_I(inode)->i_trans_id; struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; /* * for the whole inode, assume unset id means it was * changed in the current transaction. More conservative */ if (!id || !jl) { reiserfs_update_inode_transaction(inode); id = REISERFS_I(inode)->i_trans_id; /* jl will be updated in __commit_trans_jl */ } return __commit_trans_jl(inode, id, jl); } void reiserfs_restore_prepared_buffer(struct super_block *sb, struct buffer_head *bh) { struct reiserfs_journal *journal = SB_JOURNAL(sb); PROC_INFO_INC(sb, journal.restore_prepared); if (!bh) { return; } if (test_clear_buffer_journal_restore_dirty(bh) && buffer_journal_dirty(bh)) { struct reiserfs_journal_cnode *cn; reiserfs_write_lock(sb); cn = get_journal_hash_dev(sb, journal->j_list_hash_table, bh->b_blocknr); if (cn && can_dirty(cn)) { set_buffer_journal_test(bh); mark_buffer_dirty(bh); } reiserfs_write_unlock(sb); } clear_buffer_journal_prepared(bh); } extern struct tree_balance *cur_tb; /* * before we can change a metadata block, we have to make sure it won't * be written to disk while we are altering it. So, we must: * clean it * wait on it. */ int reiserfs_prepare_for_journal(struct super_block *sb, struct buffer_head *bh, int wait) { PROC_INFO_INC(sb, journal.prepare); if (!trylock_buffer(bh)) { if (!wait) return 0; lock_buffer(bh); } set_buffer_journal_prepared(bh); if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) { clear_buffer_journal_test(bh); set_buffer_journal_restore_dirty(bh); } unlock_buffer(bh); return 1; } /* * long and ugly. If flush, will not return until all commit * blocks and all real buffers in the trans are on disk. * If no_async, won't return until all commit blocks are on disk. * * keep reading, there are comments as you go along * * If the journal is aborted, we just clean up. Things like flushing * journal lists, etc just won't happen. */ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) { struct super_block *sb = th->t_super; struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_journal_cnode *cn, *next, *jl_cn; struct reiserfs_journal_cnode *last_cn = NULL; struct reiserfs_journal_desc *desc; struct reiserfs_journal_commit *commit; struct buffer_head *c_bh; /* commit bh */ struct buffer_head *d_bh; /* desc bh */ int cur_write_start = 0; /* start index of current log write */ int i; int flush; int wait_on_commit; struct reiserfs_journal_list *jl, *temp_jl; struct list_head *entry, *safe; unsigned long jindex; unsigned int commit_trans_id; int trans_half; int depth; BUG_ON(th->t_refcount > 1); BUG_ON(!th->t_trans_id); BUG_ON(!th->t_super); /* * protect flush_older_commits from doing mistakes if the * transaction ID counter gets overflowed. */ if (th->t_trans_id == ~0U) flags |= FLUSH_ALL | COMMIT_NOW | WAIT; flush = flags & FLUSH_ALL; wait_on_commit = flags & WAIT; current->journal_info = th->t_handle_save; reiserfs_check_lock_depth(sb, "journal end"); if (journal->j_len == 0) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb)); } lock_journal(sb); if (journal->j_next_full_flush) { flags |= FLUSH_ALL; flush = 1; } if (journal->j_next_async_flush) { flags |= COMMIT_NOW | WAIT; wait_on_commit = 1; } /* * check_journal_end locks the journal, and unlocks if it does * not return 1 it tells us if we should continue with the * journal_end, or just return */ if (!check_journal_end(th, flags)) { reiserfs_schedule_old_flush(sb); wake_queued_writers(sb); reiserfs_async_progress_wait(sb); goto out; } /* check_journal_end might set these, check again */ if (journal->j_next_full_flush) { flush = 1; } /* * j must wait means we have to flush the log blocks, and the * real blocks for this transaction */ if (journal->j_must_wait > 0) { flush = 1; } #ifdef REISERFS_PREALLOCATE /* * quota ops might need to nest, setup the journal_info pointer * for them and raise the refcount so that it is > 0. */ current->journal_info = th; th->t_refcount++; /* it should not involve new blocks into the transaction */ reiserfs_discard_all_prealloc(th); th->t_refcount--; current->journal_info = th->t_handle_save; #endif /* setup description block */ d_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start); set_buffer_uptodate(d_bh); desc = (struct reiserfs_journal_desc *)(d_bh)->b_data; memset(d_bh->b_data, 0, d_bh->b_size); memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8); set_desc_trans_id(desc, journal->j_trans_id); /* * setup commit block. Don't write (keep it clean too) this one * until after everyone else is written */ c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + ((journal->j_start + journal->j_len + 1) % SB_ONDISK_JOURNAL_SIZE(sb))); commit = (struct reiserfs_journal_commit *)c_bh->b_data; memset(c_bh->b_data, 0, c_bh->b_size); set_commit_trans_id(commit, journal->j_trans_id); set_buffer_uptodate(c_bh); /* init this journal list */ jl = journal->j_current_jl; /* * we lock the commit before doing anything because * we want to make sure nobody tries to run flush_commit_list until * the new transaction is fully setup, and we've already flushed the * ordered bh list */ reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb); /* save the transaction id in case we need to commit it later */ commit_trans_id = jl->j_trans_id; atomic_set(&jl->j_older_commits_done, 0); jl->j_trans_id = journal->j_trans_id; jl->j_timestamp = journal->j_trans_start_time; jl->j_commit_bh = c_bh; jl->j_start = journal->j_start; jl->j_len = journal->j_len; atomic_set(&jl->j_nonzerolen, journal->j_len); atomic_set(&jl->j_commit_left, journal->j_len + 2); jl->j_realblock = NULL; /* * The ENTIRE FOR LOOP MUST not cause schedule to occur. * for each real block, add it to the journal list hash, * copy into real block index array in the commit or desc block */ trans_half = journal_trans_half(sb->s_blocksize); for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) { if (buffer_journaled(cn->bh)) { jl_cn = get_cnode(sb); if (!jl_cn) { reiserfs_panic(sb, "journal-1676", "get_cnode returned NULL"); } if (i == 0) { jl->j_realblock = jl_cn; } jl_cn->prev = last_cn; jl_cn->next = NULL; if (last_cn) { last_cn->next = jl_cn; } last_cn = jl_cn; /* * make sure the block we are trying to log * is not a block of journal or reserved area */ if (is_block_in_log_or_reserved_area (sb, cn->bh->b_blocknr)) { reiserfs_panic(sb, "journal-2332", "Trying to log block %lu, " "which is a log block", cn->bh->b_blocknr); } jl_cn->blocknr = cn->bh->b_blocknr; jl_cn->state = 0; jl_cn->sb = sb; jl_cn->bh = cn->bh; jl_cn->jlist = jl; insert_journal_hash(journal->j_list_hash_table, jl_cn); if (i < trans_half) { desc->j_realblock[i] = cpu_to_le32(cn->bh->b_blocknr); } else { commit->j_realblock[i - trans_half] = cpu_to_le32(cn->bh->b_blocknr); } } else { i--; } } set_desc_trans_len(desc, journal->j_len); set_desc_mount_id(desc, journal->j_mount_id); set_desc_trans_id(desc, journal->j_trans_id); set_commit_trans_len(commit, journal->j_len); /* * special check in case all buffers in the journal * were marked for not logging */ BUG_ON(journal->j_len == 0); /* * we're about to dirty all the log blocks, mark the description block * dirty now too. Don't mark the commit block dirty until all the * others are on disk */ mark_buffer_dirty(d_bh); /* * first data block is j_start + 1, so add one to * cur_write_start wherever you use it */ cur_write_start = journal->j_start; cn = journal->j_first; jindex = 1; /* start at one so we don't get the desc again */ while (cn) { clear_buffer_journal_new(cn->bh); /* copy all the real blocks into log area. dirty log blocks */ if (buffer_journaled(cn->bh)) { struct buffer_head *tmp_bh; char *addr; struct page *page; tmp_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + ((cur_write_start + jindex) % SB_ONDISK_JOURNAL_SIZE(sb))); set_buffer_uptodate(tmp_bh); page = cn->bh->b_page; addr = kmap(page); memcpy(tmp_bh->b_data, addr + offset_in_page(cn->bh->b_data), cn->bh->b_size); kunmap(page); mark_buffer_dirty(tmp_bh); jindex++; set_buffer_journal_dirty(cn->bh); clear_buffer_journaled(cn->bh); } else { /* * JDirty cleared sometime during transaction. * don't log this one */ reiserfs_warning(sb, "journal-2048", "BAD, buffer in journal hash, " "but not JDirty!"); brelse(cn->bh); } next = cn->next; free_cnode(sb, cn); cn = next; reiserfs_cond_resched(sb); } /* * we are done with both the c_bh and d_bh, but * c_bh must be written after all other commit blocks, * so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. */ journal->j_current_jl = alloc_journal_list(sb); /* now it is safe to insert this transaction on the main list */ list_add_tail(&jl->j_list, &journal->j_journal_list); list_add_tail(&jl->j_working_list, &journal->j_working_list); journal->j_num_work_lists++; /* reset journal values for the next transaction */ journal->j_start = (journal->j_start + journal->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(sb); atomic_set(&journal->j_wcount, 0); journal->j_bcount = 0; journal->j_last = NULL; journal->j_first = NULL; journal->j_len = 0; journal->j_trans_start_time = 0; /* check for trans_id overflow */ if (++journal->j_trans_id == 0) journal->j_trans_id = 10; journal->j_current_jl->j_trans_id = journal->j_trans_id; journal->j_must_wait = 0; journal->j_len_alloc = 0; journal->j_next_full_flush = 0; journal->j_next_async_flush = 0; init_journal_hash(sb); /* * make sure reiserfs_add_jh sees the new current_jl before we * write out the tails */ smp_mb(); /* * tail conversion targets have to hit the disk before we end the * transaction. Otherwise a later transaction might repack the tail * before this transaction commits, leaving the data block unflushed * and clean, if we crash before the later transaction commits, the * data block is lost. */ if (!list_empty(&jl->j_tail_bh_list)) { depth = reiserfs_write_unlock_nested(sb); write_ordered_buffers(&journal->j_dirty_buffers_lock, journal, jl, &jl->j_tail_bh_list); reiserfs_write_lock_nested(sb, depth); } BUG_ON(!list_empty(&jl->j_tail_bh_list)); mutex_unlock(&jl->j_commit_mutex); /* * honor the flush wishes from the caller, simple commits can * be done outside the journal lock, they are done below * * if we don't flush the commit list right now, we put it into * the work queue so the people waiting on the async progress work * queue don't wait for this proc to flush journal lists and such. */ if (flush) { flush_commit_list(sb, jl, 1); flush_journal_list(sb, jl, 1); } else if (!(jl->j_state & LIST_COMMIT_PENDING)) { /* * Avoid queueing work when sb is being shut down. Transaction * will be flushed on journal shutdown. */ if (sb->s_flags & SB_ACTIVE) queue_delayed_work(REISERFS_SB(sb)->commit_wq, &journal->j_work, HZ / 10); } /* * if the next transaction has any chance of wrapping, flush * transactions that might get overwritten. If any journal lists * are very old flush them as well. */ first_jl: list_for_each_safe(entry, safe, &journal->j_journal_list) { temp_jl = JOURNAL_LIST_ENTRY(entry); if (journal->j_start <= temp_jl->j_start) { if ((journal->j_start + journal->j_trans_max + 1) >= temp_jl->j_start) { flush_used_journal_lists(sb, temp_jl); goto first_jl; } else if ((journal->j_start + journal->j_trans_max + 1) < SB_ONDISK_JOURNAL_SIZE(sb)) { /* * if we don't cross into the next * transaction and we don't wrap, there is * no way we can overlap any later transactions * break now */ break; } } else if ((journal->j_start + journal->j_trans_max + 1) > SB_ONDISK_JOURNAL_SIZE(sb)) { if (((journal->j_start + journal->j_trans_max + 1) % SB_ONDISK_JOURNAL_SIZE(sb)) >= temp_jl->j_start) { flush_used_journal_lists(sb, temp_jl); goto first_jl; } else { /* * we don't overlap anything from out start * to the end of the log, and our wrapped * portion doesn't overlap anything at * the start of the log. We can break */ break; } } } journal->j_current_jl->j_list_bitmap = get_list_bitmap(sb, journal->j_current_jl); if (!(journal->j_current_jl->j_list_bitmap)) { reiserfs_panic(sb, "journal-1996", "could not get a list bitmap"); } atomic_set(&journal->j_jlock, 0); unlock_journal(sb); /* wake up any body waiting to join. */ clear_bit(J_WRITERS_QUEUED, &journal->j_state); wake_up(&journal->j_join_wait); if (!flush && wait_on_commit && journal_list_still_alive(sb, commit_trans_id)) { flush_commit_list(sb, jl, 1); } out: reiserfs_check_lock_depth(sb, "journal end2"); memset(th, 0, sizeof(*th)); /* * Re-set th->t_super, so we can properly keep track of how many * persistent transactions there are. We need to do this so if this * call is part of a failed restart_transaction, we can free it later */ th->t_super = sb; return journal->j_errno; } /* Send the file system read only and refuse new transactions */ void reiserfs_abort_journal(struct super_block *sb, int errno) { struct reiserfs_journal *journal = SB_JOURNAL(sb); if (test_bit(J_ABORTED, &journal->j_state)) return; if (!journal->j_errno) journal->j_errno = errno; sb->s_flags |= SB_RDONLY; set_bit(J_ABORTED, &journal->j_state); #ifdef CONFIG_REISERFS_CHECK dump_stack(); #endif } |
83 459 199 280 222 1 1 18 160 24 21 3 92 11 83 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Authors: Lotsa people, from code originally in tcp */ #ifndef _INET_HASHTABLES_H #define _INET_HASHTABLES_H #include <linux/interrupt.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/wait.h> #include <net/inet_connection_sock.h> #include <net/inet_sock.h> #include <net/ip.h> #include <net/sock.h> #include <net/route.h> #include <net/tcp_states.h> #include <net/netns/hash.h> #include <linux/refcount.h> #include <asm/byteorder.h> /* This is for all connections with a full identity, no wildcards. * The 'e' prefix stands for Establish, but we really put all sockets * but LISTEN ones. */ struct inet_ehash_bucket { struct hlist_nulls_head chain; }; /* There are a few simple rules, which allow for local port reuse by * an application. In essence: * * 1) Sockets bound to different interfaces may share a local port. * Failing that, goto test 2. * 2) If all sockets have sk->sk_reuse set, and none of them are in * TCP_LISTEN state, the port may be shared. * Failing that, goto test 3. * 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local * address, and none of them are the same, the port may be * shared. * Failing this, the port cannot be shared. * * The interesting point, is test #2. This is what an FTP server does * all day. To optimize this case we use a specific flag bit defined * below. As we add sockets to a bind bucket list, we perform a * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN)) * As long as all sockets added to a bind bucket pass this test, * the flag bit will be set. * The resulting situation is that tcp_v[46]_verify_bind() can just check * for this flag bit, if it is set and the socket trying to bind has * sk->sk_reuse set, we don't even have to walk the owners list at all, * we return that it is ok to bind this socket to the requested local port. * * Sounds like a lot of work, but it is worth it. In a more naive * implementation (ie. current FreeBSD etc.) the entire list of ports * must be walked for each data port opened by an ftp server. Needless * to say, this does not scale at all. With a couple thousand FTP * users logged onto your box, isn't it nice to know that new data * ports are created in O(1) time? I thought so. ;-) -DaveM */ #define FASTREUSEPORT_ANY 1 #define FASTREUSEPORT_STRICT 2 struct inet_bind_bucket { possible_net_t ib_net; int l3mdev; unsigned short port; signed char fastreuse; signed char fastreuseport; kuid_t fastuid; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr fast_v6_rcv_saddr; #endif __be32 fast_rcv_saddr; unsigned short fast_sk_family; bool fast_ipv6_only; struct hlist_node node; struct hlist_head bhash2; }; struct inet_bind2_bucket { possible_net_t ib_net; int l3mdev; unsigned short port; #if IS_ENABLED(CONFIG_IPV6) unsigned short addr_type; struct in6_addr v6_rcv_saddr; #define rcv_saddr v6_rcv_saddr.s6_addr32[3] #else __be32 rcv_saddr; #endif /* Node in the bhash2 inet_bind_hashbucket chain */ struct hlist_node node; struct hlist_node bhash_node; /* List of sockets hashed to this bucket */ struct hlist_head owners; }; static inline struct net *ib_net(const struct inet_bind_bucket *ib) { return read_pnet(&ib->ib_net); } static inline struct net *ib2_net(const struct inet_bind2_bucket *ib) { return read_pnet(&ib->ib_net); } #define inet_bind_bucket_for_each(tb, head) \ hlist_for_each_entry(tb, head, node) struct inet_bind_hashbucket { spinlock_t lock; struct hlist_head chain; }; /* Sockets can be hashed in established or listening table. * We must use different 'nulls' end-of-chain value for all hash buckets : * A socket might transition from ESTABLISH to LISTEN state without * RCU grace period. A lookup in ehash table needs to handle this case. */ #define LISTENING_NULLS_BASE (1U << 29) struct inet_listen_hashbucket { spinlock_t lock; struct hlist_nulls_head nulls_head; }; /* This is for listening sockets, thus all sockets which possess wildcards. */ #define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ struct inet_hashinfo { /* This is for sockets with full identity only. Sockets here will * always be without wildcards and will have the following invariant: * * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE * */ struct inet_ehash_bucket *ehash; spinlock_t *ehash_locks; unsigned int ehash_mask; unsigned int ehash_locks_mask; /* Ok, let's try this, I give up, we do need a local binding * TCP hash as well as the others for fast bind/connect. */ struct kmem_cache *bind_bucket_cachep; /* This bind table is hashed by local port */ struct inet_bind_hashbucket *bhash; struct kmem_cache *bind2_bucket_cachep; /* This bind table is hashed by local port and sk->sk_rcv_saddr (ipv4) * or sk->sk_v6_rcv_saddr (ipv6). This 2nd bind table is used * primarily for expediting bind conflict resolution. */ struct inet_bind_hashbucket *bhash2; unsigned int bhash_size; /* The 2nd listener table hashed by local port and address */ unsigned int lhash2_mask; struct inet_listen_hashbucket *lhash2; bool pernet; } ____cacheline_aligned_in_smp; static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) { #if IS_ENABLED(CONFIG_IP_DCCP) return sk->sk_prot->h.hashinfo ? : sock_net(sk)->ipv4.tcp_death_row.hashinfo; #else return sock_net(sk)->ipv4.tcp_death_row.hashinfo; #endif } static inline struct inet_listen_hashbucket * inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash) { return &h->lhash2[hash & h->lhash2_mask]; } static inline struct inet_ehash_bucket *inet_ehash_bucket( struct inet_hashinfo *hashinfo, unsigned int hash) { return &hashinfo->ehash[hash & hashinfo->ehash_mask]; } static inline spinlock_t *inet_ehash_lockp( struct inet_hashinfo *hashinfo, unsigned int hash) { return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask]; } int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo); static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h) { kfree(h->lhash2); h->lhash2 = NULL; } static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) { kvfree(hashinfo->ehash_locks); hashinfo->ehash_locks = NULL; } struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries); void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo); struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev); void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb); bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, unsigned short port, int l3mdev); struct inet_bind2_bucket * inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, struct inet_bind_bucket *tb, const struct sock *sk); void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb); struct inet_bind2_bucket * inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk); bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk); static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, const u32 bhash_size) { return (lport + net_hash_mix(net)) & (bhash_size - 1); } static inline struct inet_bind_hashbucket * inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk, const struct net *net, unsigned short port) { u32 hash; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port); else #endif hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port); return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port); /* This should be called whenever a socket's sk_rcv_saddr (ipv4) or * sk_v6_rcv_saddr (ipv6) changes after it has been binded. The socket's * rcv_saddr field should already have been updated when this is called. */ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family); void inet_bhash2_reset_saddr(struct sock *sk); void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port); /* Caller must disable local BH processing. */ int __inet_inherit_port(const struct sock *sk, struct sock *child); void inet_put_port(struct sock *sk); void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit); int inet_hashinfo2_init_mod(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk); int __inet_hash(struct sock *sk, struct sock *osk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif); static inline struct sock *inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif) { return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, ntohs(dport), dif, sdif); } /* Socket demux engine toys. */ /* What happens here is ugly; there's a pair of adjacent fields in struct inet_sock; __be16 dport followed by __u16 num. We want to search by pair, so we combine the keys into a single 32bit value and compare with 32bit value read from &...->dport. Let's at least make sure that it's not mixed with anything else... On 64bit targets we combine comparisons with pair of adjacent __be32 fields in the same way. */ #ifdef __BIG_ENDIAN #define INET_COMBINED_PORTS(__sport, __dport) \ ((__force __portpair)(((__force __u32)(__be16)(__sport) << 16) | (__u32)(__dport))) #else /* __LITTLE_ENDIAN */ #define INET_COMBINED_PORTS(__sport, __dport) \ ((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport))) #endif #ifdef __BIG_ENDIAN #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ (((__force __u64)(__be32)(__saddr)) << 32) | \ ((__force __u64)(__be32)(__daddr))) #else /* __LITTLE_ENDIAN */ #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ (((__force __u64)(__be32)(__daddr)) << 32) | \ ((__force __u64)(__be32)(__saddr))) #endif /* __BIG_ENDIAN */ static inline bool inet_match(struct net *net, const struct sock *sk, const __addrpair cookie, const __portpair ports, int dif, int sdif) { if (!net_eq(sock_net(sk), net) || sk->sk_portpair != ports || sk->sk_addrpair != cookie) return false; /* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */ return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, sdif); } /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need * not check it for lookups anymore, thanks Alexey. -DaveM */ struct sock *__inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif); typedef u32 (inet_ehashfn_t)(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport); inet_ehashfn_t inet_ehashfn; INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn); struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, inet_ehashfn_t *ehashfn); struct sock *inet_lookup_run_sk_lookup(struct net *net, int protocol, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, u16 hnum, const int dif, inet_ehashfn_t *ehashfn); static inline struct sock * inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) { return __inet_lookup_established(net, hashinfo, saddr, sport, daddr, ntohs(dport), dif, 0); } static inline struct sock *__inet_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif, const int sdif, bool *refcounted) { u16 hnum = ntohs(dport); struct sock *sk; sk = __inet_lookup_established(net, hashinfo, saddr, sport, daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } static inline struct sock *inet_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) { struct sock *sk; bool refcounted; sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, dport, dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } static inline struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, bool *refcounted, inet_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool prefetched; sk = skb_steal_sock(skb, refcounted, &prefetched); if (!sk) return NULL; if (!prefetched || !sk_fullsock(sk)) return sk; if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_state != TCP_LISTEN) return sk; } else if (sk->sk_protocol == IPPROTO_UDP) { if (sk->sk_state != TCP_CLOSE) return sk; } else { return sk; } reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, ntohs(dport), ehashfn); if (!reuse_sk) return sk; /* We've chosen a new reuseport sock which is never refcounted. This * implies that sk also isn't refcounted. */ WARN_ON_ONCE(*refcounted); return reuse_sk; } static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, const int sdif, bool *refcounted) { struct net *net = dev_net(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); struct sock *sk; sk = inet_steal_sock(net, skb, doff, iph->saddr, sport, iph->daddr, dport, refcounted, inet_ehashfn); if (IS_ERR(sk)) return NULL; if (sk) return sk; return __inet_lookup(net, hashinfo, skb, doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb), sdif, refcounted); } static inline void sk_daddr_set(struct sock *sk, __be32 addr) { sk->sk_daddr = addr; /* alias of inet_daddr */ #if IS_ENABLED(CONFIG_IPV6) ipv6_addr_set_v4mapped(addr, &sk->sk_v6_daddr); #endif } static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr) { sk->sk_rcv_saddr = addr; /* alias of inet_rcv_saddr */ #if IS_ENABLED(CONFIG_IPV6) ipv6_addr_set_v4mapped(addr, &sk->sk_v6_rcv_saddr); #endif } int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **)); int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); #endif /* _INET_HASHTABLES_H */ |
1 2 1 2 2 2 2 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 | // SPDX-License-Identifier: GPL-2.0-only /* * TCP HYBLA * * TCP-HYBLA Congestion control algorithm, based on: * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement * for Heterogeneous Networks", * International Journal on satellite Communications, * September 2004 * Daniele Lacamera * root at danielinux.net */ #include <linux/module.h> #include <net/tcp.h> /* Tcp Hybla structure. */ struct hybla { bool hybla_en; u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */ u32 rho; /* Rho parameter, integer part */ u32 rho2; /* Rho * Rho, integer part */ u32 rho_3ls; /* Rho parameter, <<3 */ u32 rho2_7ls; /* Rho^2, <<7 */ u32 minrtt_us; /* Minimum smoothed round trip time value seen */ }; /* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */ static int rtt0 = 25; module_param(rtt0, int, 0644); MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); /* This is called to refresh values for hybla parameters */ static inline void hybla_recalc_param (struct sock *sk) { struct hybla *ca = inet_csk_ca(sk); ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC), 8U); ca->rho = ca->rho_3ls >> 3; ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; ca->rho2 = ca->rho2_7ls >> 7; } static void hybla_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct hybla *ca = inet_csk_ca(sk); ca->rho = 0; ca->rho2 = 0; ca->rho_3ls = 0; ca->rho2_7ls = 0; ca->snd_cwnd_cents = 0; ca->hybla_en = true; tcp_snd_cwnd_set(tp, 2); tp->snd_cwnd_clamp = 65535; /* 1st Rho measurement based on initial srtt */ hybla_recalc_param(sk); /* set minimum rtt as this is the 1st ever seen */ ca->minrtt_us = tp->srtt_us; tcp_snd_cwnd_set(tp, ca->rho); } static void hybla_state(struct sock *sk, u8 ca_state) { struct hybla *ca = inet_csk_ca(sk); ca->hybla_en = (ca_state == TCP_CA_Open); } static inline u32 hybla_fraction(u32 odds) { static const u32 fractions[] = { 128, 139, 152, 165, 181, 197, 215, 234, }; return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128; } /* TCP Hybla main routine. * This is the algorithm behavior: * o Recalc Hybla parameters if min_rtt has changed * o Give cwnd a new value based on the model proposed * o remember increments <1 */ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct hybla *ca = inet_csk_ca(sk); u32 increment, odd, rho_fractions; int is_slowstart = 0; /* Recalculate rho only if this srtt is the lowest */ if (tp->srtt_us < ca->minrtt_us) { hybla_recalc_param(sk); ca->minrtt_us = tp->srtt_us; } if (!tcp_is_cwnd_limited(sk)) return; if (!ca->hybla_en) { tcp_reno_cong_avoid(sk, ack, acked); return; } if (ca->rho == 0) hybla_recalc_param(sk); rho_fractions = ca->rho_3ls - (ca->rho << 3); if (tcp_in_slow_start(tp)) { /* * slow start * INC = 2^RHO - 1 * This is done by splitting the rho parameter * into 2 parts: an integer part and a fraction part. * Inrement<<7 is estimated by doing: * [2^(int+fract)]<<7 * that is equal to: * (2^int) * [(2^fract) <<7] * 2^int is straightly computed as 1<<int, * while we will use hybla_slowstart_fraction_increment() to * calculate 2^fract in a <<7 value. */ is_slowstart = 1; increment = ((1 << min(ca->rho, 16U)) * hybla_fraction(rho_fractions)) - 128; } else { /* * congestion avoidance * INC = RHO^2 / W * as long as increment is estimated as (rho<<7)/window * it already is <<7 and we can easily count its fractions. */ increment = ca->rho2_7ls / tcp_snd_cwnd(tp); if (increment < 128) tp->snd_cwnd_cnt++; } odd = increment % 128; tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + (increment >> 7)); ca->snd_cwnd_cents += odd; /* check when fractions goes >=128 and increase cwnd by 1. */ while (ca->snd_cwnd_cents >= 128) { tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); ca->snd_cwnd_cents -= 128; tp->snd_cwnd_cnt = 0; } /* check when cwnd has not been incremented for a while */ if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tcp_snd_cwnd(tp)) { tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); tp->snd_cwnd_cnt = 0; } /* clamp down slowstart cwnd to ssthresh value. */ if (is_slowstart) tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_ssthresh)); tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp)); } static struct tcp_congestion_ops tcp_hybla __read_mostly = { .init = hybla_init, .ssthresh = tcp_reno_ssthresh, .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = hybla_cong_avoid, .set_state = hybla_state, .owner = THIS_MODULE, .name = "hybla" }; static int __init hybla_register(void) { BUILD_BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&tcp_hybla); } static void __exit hybla_unregister(void) { tcp_unregister_congestion_control(&tcp_hybla); } module_init(hybla_register); module_exit(hybla_unregister); MODULE_AUTHOR("Daniele Lacamera"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TCP Hybla"); |
37 6 3 7 5 317 1 326 235 203 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2009-2021 Christoph Hellwig * * NOTE: none of these tracepoints shall be considered a stable kernel ABI * as they can change at any time. * * Current conventions for printing numbers measuring specific units: * * offset: byte offset into a subcomponent of a file operation * pos: file offset, in bytes * length: length of a file operation, in bytes * ino: inode number * * Numbers describing space allocations should be formatted in hexadecimal. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM iomap #if !defined(_IOMAP_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define _IOMAP_TRACE_H #include <linux/tracepoint.h> struct inode; DECLARE_EVENT_CLASS(iomap_readpage_class, TP_PROTO(struct inode *inode, int nr_pages), TP_ARGS(inode, nr_pages), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(int, nr_pages) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nr_pages = nr_pages; ), TP_printk("dev %d:%d ino 0x%llx nr_pages %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->nr_pages) ) #define DEFINE_READPAGE_EVENT(name) \ DEFINE_EVENT(iomap_readpage_class, name, \ TP_PROTO(struct inode *inode, int nr_pages), \ TP_ARGS(inode, nr_pages)) DEFINE_READPAGE_EVENT(iomap_readpage); DEFINE_READPAGE_EVENT(iomap_readahead); DECLARE_EVENT_CLASS(iomap_range_class, TP_PROTO(struct inode *inode, loff_t off, u64 len), TP_ARGS(inode, off, len), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(loff_t, size) __field(loff_t, offset) __field(u64, length) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->size = i_size_read(inode); __entry->offset = off; __entry->length = len; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx length 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, __entry->length) ) #define DEFINE_RANGE_EVENT(name) \ DEFINE_EVENT(iomap_range_class, name, \ TP_PROTO(struct inode *inode, loff_t off, u64 len),\ TP_ARGS(inode, off, len)) DEFINE_RANGE_EVENT(iomap_writepage); DEFINE_RANGE_EVENT(iomap_release_folio); DEFINE_RANGE_EVENT(iomap_invalidate_folio); DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail); DEFINE_RANGE_EVENT(iomap_dio_rw_queued); #define IOMAP_TYPE_STRINGS \ { IOMAP_HOLE, "HOLE" }, \ { IOMAP_DELALLOC, "DELALLOC" }, \ { IOMAP_MAPPED, "MAPPED" }, \ { IOMAP_UNWRITTEN, "UNWRITTEN" }, \ { IOMAP_INLINE, "INLINE" } #define IOMAP_FLAGS_STRINGS \ { IOMAP_WRITE, "WRITE" }, \ { IOMAP_ZERO, "ZERO" }, \ { IOMAP_REPORT, "REPORT" }, \ { IOMAP_FAULT, "FAULT" }, \ { IOMAP_DIRECT, "DIRECT" }, \ { IOMAP_NOWAIT, "NOWAIT" } #define IOMAP_F_FLAGS_STRINGS \ { IOMAP_F_NEW, "NEW" }, \ { IOMAP_F_DIRTY, "DIRTY" }, \ { IOMAP_F_SHARED, "SHARED" }, \ { IOMAP_F_MERGED, "MERGED" }, \ { IOMAP_F_BUFFER_HEAD, "BH" }, \ { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" } #define IOMAP_DIO_STRINGS \ {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \ {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" } DECLARE_EVENT_CLASS(iomap_class, TP_PROTO(struct inode *inode, struct iomap *iomap), TP_ARGS(inode, iomap), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(u64, addr) __field(loff_t, offset) __field(u64, length) __field(u16, type) __field(u16, flags) __field(dev_t, bdev) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->addr = iomap->addr; __entry->offset = iomap->offset; __entry->length = iomap->length; __entry->type = iomap->type; __entry->flags = iomap->flags; __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0; ), TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr 0x%llx offset 0x%llx " "length 0x%llx type %s flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, MAJOR(__entry->bdev), MINOR(__entry->bdev), __entry->addr, __entry->offset, __entry->length, __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS)) ) #define DEFINE_IOMAP_EVENT(name) \ DEFINE_EVENT(iomap_class, name, \ TP_PROTO(struct inode *inode, struct iomap *iomap), \ TP_ARGS(inode, iomap)) DEFINE_IOMAP_EVENT(iomap_iter_dstmap); DEFINE_IOMAP_EVENT(iomap_iter_srcmap); TRACE_EVENT(iomap_writepage_map, TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len, struct iomap *iomap), TP_ARGS(inode, pos, dirty_len, iomap), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(u64, pos) __field(u64, dirty_len) __field(u64, addr) __field(loff_t, offset) __field(u64, length) __field(u16, type) __field(u16, flags) __field(dev_t, bdev) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->dirty_len = dirty_len; __entry->addr = iomap->addr; __entry->offset = iomap->offset; __entry->length = iomap->length; __entry->type = iomap->type; __entry->flags = iomap->flags; __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0; ), TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx " "addr 0x%llx offset 0x%llx length 0x%llx type %s flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, MAJOR(__entry->bdev), MINOR(__entry->bdev), __entry->pos, __entry->dirty_len, __entry->addr, __entry->offset, __entry->length, __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS)) ); TRACE_EVENT(iomap_iter, TP_PROTO(struct iomap_iter *iter, const void *ops, unsigned long caller), TP_ARGS(iter, ops, caller), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(loff_t, pos) __field(u64, length) __field(s64, processed) __field(unsigned int, flags) __field(const void *, ops) __field(unsigned long, caller) ), TP_fast_assign( __entry->dev = iter->inode->i_sb->s_dev; __entry->ino = iter->inode->i_ino; __entry->pos = iter->pos; __entry->length = iomap_length(iter); __entry->processed = iter->processed; __entry->flags = iter->flags; __entry->ops = ops; __entry->caller = caller; ), TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->length, __entry->processed, __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), __entry->flags, __entry->ops, (void *)__entry->caller) ); TRACE_EVENT(iomap_dio_rw_begin, TP_PROTO(struct kiocb *iocb, struct iov_iter *iter, unsigned int dio_flags, size_t done_before), TP_ARGS(iocb, iter, dio_flags, done_before), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, isize) __field(loff_t, pos) __field(size_t, count) __field(size_t, done_before) __field(int, ki_flags) __field(unsigned int, dio_flags) __field(bool, aio) ), TP_fast_assign( __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev; __entry->ino = file_inode(iocb->ki_filp)->i_ino; __entry->isize = file_inode(iocb->ki_filp)->i_size; __entry->pos = iocb->ki_pos; __entry->count = iov_iter_count(iter); __entry->done_before = done_before; __entry->ki_flags = iocb->ki_flags; __entry->dio_flags = dio_flags; __entry->aio = !is_sync_kiocb(iocb); ), TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx length 0x%zx done_before 0x%zx flags %s dio_flags %s aio %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, __entry->pos, __entry->count, __entry->done_before, __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS), __print_flags(__entry->dio_flags, "|", IOMAP_DIO_STRINGS), __entry->aio) ); TRACE_EVENT(iomap_dio_complete, TP_PROTO(struct kiocb *iocb, int error, ssize_t ret), TP_ARGS(iocb, error, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, isize) __field(loff_t, pos) __field(int, ki_flags) __field(bool, aio) __field(int, error) __field(ssize_t, ret) ), TP_fast_assign( __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev; __entry->ino = file_inode(iocb->ki_filp)->i_ino; __entry->isize = file_inode(iocb->ki_filp)->i_size; __entry->pos = iocb->ki_pos; __entry->ki_flags = iocb->ki_flags; __entry->aio = !is_sync_kiocb(iocb); __entry->error = error; __entry->ret = ret; ), TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx flags %s aio %d error %d ret %zd", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, __entry->pos, __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS), __entry->aio, __entry->error, __entry->ret) ); #endif /* _IOMAP_TRACE_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h> |
3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 | // SPDX-License-Identifier: GPL-2.0 /****************************************************************************** * rtl8712_io.c * * Copyright(c) 2007 - 2010 Realtek Corporation. All rights reserved. * Linux device driver for RTL8192SU * * Modifications for inclusion into the Linux staging tree are * Copyright(c) 2010 Larry Finger. All rights reserved. * * Contact information: * WLAN FAE <wlanfae@realtek.com>. * Larry Finger <Larry.Finger@lwfinger.net> * ******************************************************************************/ #define _RTL8712_IO_C_ #include "osdep_service.h" #include "drv_types.h" #include "rtl871x_io.h" #include "osdep_intf.h" #include "usb_ops.h" u8 r8712_read8(struct _adapter *adapter, u32 addr) { struct intf_hdl *hdl = &adapter->pio_queue->intf; return hdl->io_ops._read8(hdl, addr); } u16 r8712_read16(struct _adapter *adapter, u32 addr) { struct intf_hdl *hdl = &adapter->pio_queue->intf; return hdl->io_ops._read16(hdl, addr); } u32 r8712_read32(struct _adapter *adapter, u32 addr) { struct intf_hdl *hdl = &adapter->pio_queue->intf; return hdl->io_ops._read32(hdl, addr); } void r8712_write8(struct _adapter *adapter, u32 addr, u8 val) { struct intf_hdl *hdl = &adapter->pio_queue->intf; hdl->io_ops._write8(hdl, addr, val); } void r8712_write16(struct _adapter *adapter, u32 addr, u16 val) { struct intf_hdl *hdl = &adapter->pio_queue->intf; hdl->io_ops._write16(hdl, addr, val); } void r8712_write32(struct _adapter *adapter, u32 addr, u32 val) { struct intf_hdl *hdl = &adapter->pio_queue->intf; hdl->io_ops._write32(hdl, addr, val); } void r8712_read_mem(struct _adapter *adapter, u32 addr, u32 cnt, u8 *pmem) { struct intf_hdl *hdl = &adapter->pio_queue->intf; if (adapter->driver_stopped || adapter->surprise_removed) return; hdl->io_ops._read_mem(hdl, addr, cnt, pmem); } void r8712_write_mem(struct _adapter *adapter, u32 addr, u32 cnt, u8 *pmem) { struct intf_hdl *hdl = &adapter->pio_queue->intf; hdl->io_ops._write_mem(hdl, addr, cnt, pmem); } void r8712_read_port(struct _adapter *adapter, u32 addr, u32 cnt, u8 *pmem) { struct intf_hdl *hdl = &adapter->pio_queue->intf; if (adapter->driver_stopped || adapter->surprise_removed) return; hdl->io_ops._read_port(hdl, addr, cnt, pmem); } void r8712_write_port(struct _adapter *adapter, u32 addr, u32 cnt, u8 *pmem) { struct intf_hdl *hdl = &adapter->pio_queue->intf; hdl->io_ops._write_port(hdl, addr, cnt, pmem); } |
3 3 2 2 2 2 3 3 3 3 7 15 6 6 4 11 7 5 4 2 2 3 9 2 9 11 15 7 7 7 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic MIDI synth driver for ALSA sequencer * Copyright (c) 1998 by Frank van de Pol <fvdpol@coil.demon.nl> * Jaroslav Kysela <perex@perex.cz> */ /* Possible options for midisynth module: - automatic opening of midi ports on first received event or subscription (close will be performed when client leaves) */ #include <linux/init.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/module.h> #include <linux/mutex.h> #include <sound/core.h> #include <sound/rawmidi.h> #include <sound/seq_kernel.h> #include <sound/seq_device.h> #include <sound/seq_midi_event.h> #include <sound/initval.h> MODULE_AUTHOR("Frank van de Pol <fvdpol@coil.demon.nl>, Jaroslav Kysela <perex@perex.cz>"); MODULE_DESCRIPTION("Advanced Linux Sound Architecture sequencer MIDI synth."); MODULE_LICENSE("GPL"); static int output_buffer_size = PAGE_SIZE; module_param(output_buffer_size, int, 0644); MODULE_PARM_DESC(output_buffer_size, "Output buffer size in bytes."); static int input_buffer_size = PAGE_SIZE; module_param(input_buffer_size, int, 0644); MODULE_PARM_DESC(input_buffer_size, "Input buffer size in bytes."); /* data for this midi synth driver */ struct seq_midisynth { struct snd_card *card; struct snd_rawmidi *rmidi; int device; int subdevice; struct snd_rawmidi_file input_rfile; struct snd_rawmidi_file output_rfile; int seq_client; int seq_port; struct snd_midi_event *parser; }; struct seq_midisynth_client { int seq_client; int num_ports; int ports_per_device[SNDRV_RAWMIDI_DEVICES]; struct seq_midisynth *ports[SNDRV_RAWMIDI_DEVICES]; }; static struct seq_midisynth_client *synths[SNDRV_CARDS]; static DEFINE_MUTEX(register_mutex); /* handle rawmidi input event (MIDI v1.0 stream) */ static void snd_midi_input_event(struct snd_rawmidi_substream *substream) { struct snd_rawmidi_runtime *runtime; struct seq_midisynth *msynth; struct snd_seq_event ev; char buf[16], *pbuf; long res; if (substream == NULL) return; runtime = substream->runtime; msynth = runtime->private_data; if (msynth == NULL) return; memset(&ev, 0, sizeof(ev)); while (runtime->avail > 0) { res = snd_rawmidi_kernel_read(substream, buf, sizeof(buf)); if (res <= 0) continue; if (msynth->parser == NULL) continue; pbuf = buf; while (res-- > 0) { if (!snd_midi_event_encode_byte(msynth->parser, *pbuf++, &ev)) continue; ev.source.port = msynth->seq_port; ev.dest.client = SNDRV_SEQ_ADDRESS_SUBSCRIBERS; snd_seq_kernel_client_dispatch(msynth->seq_client, &ev, 1, 0); /* clear event and reset header */ memset(&ev, 0, sizeof(ev)); } } } static int dump_midi(struct snd_rawmidi_substream *substream, const char *buf, int count) { struct snd_rawmidi_runtime *runtime; int tmp; if (snd_BUG_ON(!substream || !buf)) return -EINVAL; runtime = substream->runtime; tmp = runtime->avail; if (tmp < count) { if (printk_ratelimit()) pr_err("ALSA: seq_midi: MIDI output buffer overrun\n"); return -ENOMEM; } if (snd_rawmidi_kernel_write(substream, buf, count) < count) return -EINVAL; return 0; } /* callback for snd_seq_dump_var_event(), bridging to dump_midi() */ static int __dump_midi(void *ptr, void *buf, int count) { return dump_midi(ptr, buf, count); } static int event_process_midi(struct snd_seq_event *ev, int direct, void *private_data, int atomic, int hop) { struct seq_midisynth *msynth = private_data; unsigned char msg[10]; /* buffer for constructing midi messages */ struct snd_rawmidi_substream *substream; int len; if (snd_BUG_ON(!msynth)) return -EINVAL; substream = msynth->output_rfile.output; if (substream == NULL) return -ENODEV; if (ev->type == SNDRV_SEQ_EVENT_SYSEX) { /* special case, to save space */ if ((ev->flags & SNDRV_SEQ_EVENT_LENGTH_MASK) != SNDRV_SEQ_EVENT_LENGTH_VARIABLE) { /* invalid event */ pr_debug("ALSA: seq_midi: invalid sysex event flags = 0x%x\n", ev->flags); return 0; } snd_seq_dump_var_event(ev, __dump_midi, substream); snd_midi_event_reset_decode(msynth->parser); } else { if (msynth->parser == NULL) return -EIO; len = snd_midi_event_decode(msynth->parser, msg, sizeof(msg), ev); if (len < 0) return 0; if (dump_midi(substream, msg, len) < 0) snd_midi_event_reset_decode(msynth->parser); } return 0; } static int snd_seq_midisynth_new(struct seq_midisynth *msynth, struct snd_card *card, int device, int subdevice) { if (snd_midi_event_new(MAX_MIDI_EVENT_BUF, &msynth->parser) < 0) return -ENOMEM; msynth->card = card; msynth->device = device; msynth->subdevice = subdevice; return 0; } /* open associated midi device for input */ static int midisynth_subscribe(void *private_data, struct snd_seq_port_subscribe *info) { int err; struct seq_midisynth *msynth = private_data; struct snd_rawmidi_runtime *runtime; struct snd_rawmidi_params params; /* open midi port */ err = snd_rawmidi_kernel_open(msynth->rmidi, msynth->subdevice, SNDRV_RAWMIDI_LFLG_INPUT, &msynth->input_rfile); if (err < 0) { pr_debug("ALSA: seq_midi: midi input open failed!!!\n"); return err; } runtime = msynth->input_rfile.input->runtime; memset(¶ms, 0, sizeof(params)); params.avail_min = 1; params.buffer_size = input_buffer_size; err = snd_rawmidi_input_params(msynth->input_rfile.input, ¶ms); if (err < 0) { snd_rawmidi_kernel_release(&msynth->input_rfile); return err; } snd_midi_event_reset_encode(msynth->parser); runtime->event = snd_midi_input_event; runtime->private_data = msynth; snd_rawmidi_kernel_read(msynth->input_rfile.input, NULL, 0); return 0; } /* close associated midi device for input */ static int midisynth_unsubscribe(void *private_data, struct snd_seq_port_subscribe *info) { int err; struct seq_midisynth *msynth = private_data; if (snd_BUG_ON(!msynth->input_rfile.input)) return -EINVAL; err = snd_rawmidi_kernel_release(&msynth->input_rfile); return err; } /* open associated midi device for output */ static int midisynth_use(void *private_data, struct snd_seq_port_subscribe *info) { int err; struct seq_midisynth *msynth = private_data; struct snd_rawmidi_params params; /* open midi port */ err = snd_rawmidi_kernel_open(msynth->rmidi, msynth->subdevice, SNDRV_RAWMIDI_LFLG_OUTPUT, &msynth->output_rfile); if (err < 0) { pr_debug("ALSA: seq_midi: midi output open failed!!!\n"); return err; } memset(¶ms, 0, sizeof(params)); params.avail_min = 1; params.buffer_size = output_buffer_size; params.no_active_sensing = 1; err = snd_rawmidi_output_params(msynth->output_rfile.output, ¶ms); if (err < 0) { snd_rawmidi_kernel_release(&msynth->output_rfile); return err; } snd_midi_event_reset_decode(msynth->parser); return 0; } /* close associated midi device for output */ static int midisynth_unuse(void *private_data, struct snd_seq_port_subscribe *info) { struct seq_midisynth *msynth = private_data; if (snd_BUG_ON(!msynth->output_rfile.output)) return -EINVAL; snd_rawmidi_drain_output(msynth->output_rfile.output); return snd_rawmidi_kernel_release(&msynth->output_rfile); } /* delete given midi synth port */ static void snd_seq_midisynth_delete(struct seq_midisynth *msynth) { if (msynth == NULL) return; if (msynth->seq_client > 0) { /* delete port */ snd_seq_event_port_detach(msynth->seq_client, msynth->seq_port); } snd_midi_event_free(msynth->parser); } /* register new midi synth port */ static int snd_seq_midisynth_probe(struct device *_dev) { struct snd_seq_device *dev = to_seq_dev(_dev); struct seq_midisynth_client *client; struct seq_midisynth *msynth, *ms; struct snd_seq_port_info *port __free(kfree) = NULL; struct snd_rawmidi_info *info __free(kfree) = NULL; struct snd_rawmidi *rmidi = dev->private_data; int newclient = 0; unsigned int p, ports; struct snd_seq_port_callback pcallbacks; struct snd_card *card = dev->card; int device = dev->device; unsigned int input_count = 0, output_count = 0; if (snd_BUG_ON(!card || device < 0 || device >= SNDRV_RAWMIDI_DEVICES)) return -EINVAL; info = kmalloc(sizeof(*info), GFP_KERNEL); if (! info) return -ENOMEM; info->device = device; info->stream = SNDRV_RAWMIDI_STREAM_OUTPUT; info->subdevice = 0; if (snd_rawmidi_info_select(card, info) >= 0) output_count = info->subdevices_count; info->stream = SNDRV_RAWMIDI_STREAM_INPUT; if (snd_rawmidi_info_select(card, info) >= 0) { input_count = info->subdevices_count; } ports = output_count; if (ports < input_count) ports = input_count; if (ports == 0) return -ENODEV; if (ports > (256 / SNDRV_RAWMIDI_DEVICES)) ports = 256 / SNDRV_RAWMIDI_DEVICES; guard(mutex)(®ister_mutex); client = synths[card->number]; if (client == NULL) { newclient = 1; client = kzalloc(sizeof(*client), GFP_KERNEL); if (client == NULL) return -ENOMEM; client->seq_client = snd_seq_create_kernel_client( card, 0, "%s", card->shortname[0] ? (const char *)card->shortname : "External MIDI"); if (client->seq_client < 0) { kfree(client); return -ENOMEM; } } msynth = kcalloc(ports, sizeof(struct seq_midisynth), GFP_KERNEL); port = kmalloc(sizeof(*port), GFP_KERNEL); if (msynth == NULL || port == NULL) goto __nomem; for (p = 0; p < ports; p++) { ms = &msynth[p]; ms->rmidi = rmidi; if (snd_seq_midisynth_new(ms, card, device, p) < 0) goto __nomem; /* declare port */ memset(port, 0, sizeof(*port)); port->addr.client = client->seq_client; port->addr.port = device * (256 / SNDRV_RAWMIDI_DEVICES) + p; port->flags = SNDRV_SEQ_PORT_FLG_GIVEN_PORT; memset(info, 0, sizeof(*info)); info->device = device; if (p < output_count) info->stream = SNDRV_RAWMIDI_STREAM_OUTPUT; else info->stream = SNDRV_RAWMIDI_STREAM_INPUT; info->subdevice = p; if (snd_rawmidi_info_select(card, info) >= 0) strcpy(port->name, info->subname); if (! port->name[0]) { if (info->name[0]) { if (ports > 1) scnprintf(port->name, sizeof(port->name), "%s-%u", info->name, p); else scnprintf(port->name, sizeof(port->name), "%s", info->name); } else { /* last resort */ if (ports > 1) sprintf(port->name, "MIDI %d-%d-%u", card->number, device, p); else sprintf(port->name, "MIDI %d-%d", card->number, device); } } if ((info->flags & SNDRV_RAWMIDI_INFO_OUTPUT) && p < output_count) port->capability |= SNDRV_SEQ_PORT_CAP_WRITE | SNDRV_SEQ_PORT_CAP_SYNC_WRITE | SNDRV_SEQ_PORT_CAP_SUBS_WRITE; if ((info->flags & SNDRV_RAWMIDI_INFO_INPUT) && p < input_count) port->capability |= SNDRV_SEQ_PORT_CAP_READ | SNDRV_SEQ_PORT_CAP_SYNC_READ | SNDRV_SEQ_PORT_CAP_SUBS_READ; if ((port->capability & (SNDRV_SEQ_PORT_CAP_WRITE|SNDRV_SEQ_PORT_CAP_READ)) == (SNDRV_SEQ_PORT_CAP_WRITE|SNDRV_SEQ_PORT_CAP_READ) && info->flags & SNDRV_RAWMIDI_INFO_DUPLEX) port->capability |= SNDRV_SEQ_PORT_CAP_DUPLEX; if (port->capability & SNDRV_SEQ_PORT_CAP_READ) port->direction |= SNDRV_SEQ_PORT_DIR_INPUT; if (port->capability & SNDRV_SEQ_PORT_CAP_WRITE) port->direction |= SNDRV_SEQ_PORT_DIR_OUTPUT; port->type = SNDRV_SEQ_PORT_TYPE_MIDI_GENERIC | SNDRV_SEQ_PORT_TYPE_HARDWARE | SNDRV_SEQ_PORT_TYPE_PORT; port->midi_channels = 16; memset(&pcallbacks, 0, sizeof(pcallbacks)); pcallbacks.owner = THIS_MODULE; pcallbacks.private_data = ms; pcallbacks.subscribe = midisynth_subscribe; pcallbacks.unsubscribe = midisynth_unsubscribe; pcallbacks.use = midisynth_use; pcallbacks.unuse = midisynth_unuse; pcallbacks.event_input = event_process_midi; port->kernel = &pcallbacks; if (rmidi->ops && rmidi->ops->get_port_info) rmidi->ops->get_port_info(rmidi, p, port); if (snd_seq_kernel_client_ctl(client->seq_client, SNDRV_SEQ_IOCTL_CREATE_PORT, port)<0) goto __nomem; ms->seq_client = client->seq_client; ms->seq_port = port->addr.port; } client->ports_per_device[device] = ports; client->ports[device] = msynth; client->num_ports++; if (newclient) synths[card->number] = client; return 0; /* success */ __nomem: if (msynth != NULL) { for (p = 0; p < ports; p++) snd_seq_midisynth_delete(&msynth[p]); kfree(msynth); } if (newclient) { snd_seq_delete_kernel_client(client->seq_client); kfree(client); } return -ENOMEM; } /* release midi synth port */ static int snd_seq_midisynth_remove(struct device *_dev) { struct snd_seq_device *dev = to_seq_dev(_dev); struct seq_midisynth_client *client; struct seq_midisynth *msynth; struct snd_card *card = dev->card; int device = dev->device, p, ports; guard(mutex)(®ister_mutex); client = synths[card->number]; if (client == NULL || client->ports[device] == NULL) return -ENODEV; ports = client->ports_per_device[device]; client->ports_per_device[device] = 0; msynth = client->ports[device]; client->ports[device] = NULL; for (p = 0; p < ports; p++) snd_seq_midisynth_delete(&msynth[p]); kfree(msynth); client->num_ports--; if (client->num_ports <= 0) { snd_seq_delete_kernel_client(client->seq_client); synths[card->number] = NULL; kfree(client); } return 0; } static struct snd_seq_driver seq_midisynth_driver = { .driver = { .name = KBUILD_MODNAME, .probe = snd_seq_midisynth_probe, .remove = snd_seq_midisynth_remove, }, .id = SNDRV_SEQ_DEV_ID_MIDISYNTH, .argsize = 0, }; module_snd_seq_driver(seq_midisynth_driver); |
5 2 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "../fs/internal.h" #include "io_uring.h" #include "statx.h" struct io_statx { struct file *file; int dfd; unsigned int mask; unsigned int flags; struct filename *filename; struct statx __user *buffer; }; int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx); const char __user *path; if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; sx->dfd = READ_ONCE(sqe->fd); sx->mask = READ_ONCE(sqe->len); path = u64_to_user_ptr(READ_ONCE(sqe->addr)); sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); sx->flags = READ_ONCE(sqe->statx_flags); sx->filename = getname_flags(path, getname_statx_lookup_flags(sx->flags), NULL); if (IS_ERR(sx->filename)) { int ret = PTR_ERR(sx->filename); sx->filename = NULL; return ret; } req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_statx(struct io_kiocb *req, unsigned int issue_flags) { struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); io_req_set_res(req, ret, 0); return IOU_OK; } void io_statx_cleanup(struct io_kiocb *req) { struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx); if (sx->filename) putname(sx->filename); } |
21 31 1 21 29 6 24 24 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 | // SPDX-License-Identifier: GPL-2.0-or-later /* * lib/textsearch.c Generic text search interface * * Authors: Thomas Graf <tgraf@suug.ch> * Pablo Neira Ayuso <pablo@netfilter.org> * * ========================================================================== */ /** * DOC: ts_intro * INTRODUCTION * * The textsearch infrastructure provides text searching facilities for * both linear and non-linear data. Individual search algorithms are * implemented in modules and chosen by the user. * * ARCHITECTURE * * .. code-block:: none * * User * +----------------+ * | finish()|<--------------(6)-----------------+ * |get_next_block()|<--------------(5)---------------+ | * | | Algorithm | | * | | +------------------------------+ * | | | init() find() destroy() | * | | +------------------------------+ * | | Core API ^ ^ ^ * | | +---------------+ (2) (4) (8) * | (1)|----->| prepare() |---+ | | * | (3)|----->| find()/next() |-----------+ | * | (7)|----->| destroy() |----------------------+ * +----------------+ +---------------+ * * (1) User configures a search by calling textsearch_prepare() specifying * the search parameters such as the pattern and algorithm name. * (2) Core requests the algorithm to allocate and initialize a search * configuration according to the specified parameters. * (3) User starts the search(es) by calling textsearch_find() or * textsearch_next() to fetch subsequent occurrences. A state variable * is provided to the algorithm to store persistent variables. * (4) Core eventually resets the search offset and forwards the find() * request to the algorithm. * (5) Algorithm calls get_next_block() provided by the user continuously * to fetch the data to be searched in block by block. * (6) Algorithm invokes finish() after the last call to get_next_block * to clean up any leftovers from get_next_block. (Optional) * (7) User destroys the configuration by calling textsearch_destroy(). * (8) Core notifies the algorithm to destroy algorithm specific * allocations. (Optional) * * USAGE * * Before a search can be performed, a configuration must be created * by calling textsearch_prepare() specifying the searching algorithm, * the pattern to look for and flags. As a flag, you can set TS_IGNORECASE * to perform case insensitive matching. But it might slow down * performance of algorithm, so you should use it at own your risk. * The returned configuration may then be used for an arbitrary * amount of times and even in parallel as long as a separate struct * ts_state variable is provided to every instance. * * The actual search is performed by either calling * textsearch_find_continuous() for linear data or by providing * an own get_next_block() implementation and * calling textsearch_find(). Both functions return * the position of the first occurrence of the pattern or UINT_MAX if * no match was found. Subsequent occurrences can be found by calling * textsearch_next() regardless of the linearity of the data. * * Once you're done using a configuration it must be given back via * textsearch_destroy. * * EXAMPLE:: * * int pos; * struct ts_config *conf; * struct ts_state state; * const char *pattern = "chicken"; * const char *example = "We dance the funky chicken"; * * conf = textsearch_prepare("kmp", pattern, strlen(pattern), * GFP_KERNEL, TS_AUTOLOAD); * if (IS_ERR(conf)) { * err = PTR_ERR(conf); * goto errout; * } * * pos = textsearch_find_continuous(conf, &state, example, strlen(example)); * if (pos != UINT_MAX) * panic("Oh my god, dancing chickens at %d\n", pos); * * textsearch_destroy(conf); */ /* ========================================================================== */ #include <linux/module.h> #include <linux/types.h> #include <linux/string.h> #include <linux/init.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/err.h> #include <linux/textsearch.h> #include <linux/slab.h> static LIST_HEAD(ts_ops); static DEFINE_SPINLOCK(ts_mod_lock); static inline struct ts_ops *lookup_ts_algo(const char *name) { struct ts_ops *o; rcu_read_lock(); list_for_each_entry_rcu(o, &ts_ops, list) { if (!strcmp(name, o->name)) { if (!try_module_get(o->owner)) o = NULL; rcu_read_unlock(); return o; } } rcu_read_unlock(); return NULL; } /** * textsearch_register - register a textsearch module * @ops: operations lookup table * * This function must be called by textsearch modules to announce * their presence. The specified &@ops must have %name set to a * unique identifier and the callbacks find(), init(), get_pattern(), * and get_pattern_len() must be implemented. * * Returns 0 or -EEXISTS if another module has already registered * with same name. */ int textsearch_register(struct ts_ops *ops) { int err = -EEXIST; struct ts_ops *o; if (ops->name == NULL || ops->find == NULL || ops->init == NULL || ops->get_pattern == NULL || ops->get_pattern_len == NULL) return -EINVAL; spin_lock(&ts_mod_lock); list_for_each_entry(o, &ts_ops, list) { if (!strcmp(ops->name, o->name)) goto errout; } list_add_tail_rcu(&ops->list, &ts_ops); err = 0; errout: spin_unlock(&ts_mod_lock); return err; } EXPORT_SYMBOL(textsearch_register); /** * textsearch_unregister - unregister a textsearch module * @ops: operations lookup table * * This function must be called by textsearch modules to announce * their disappearance for examples when the module gets unloaded. * The &ops parameter must be the same as the one during the * registration. * * Returns 0 on success or -ENOENT if no matching textsearch * registration was found. */ int textsearch_unregister(struct ts_ops *ops) { int err = 0; struct ts_ops *o; spin_lock(&ts_mod_lock); list_for_each_entry(o, &ts_ops, list) { if (o == ops) { list_del_rcu(&o->list); goto out; } } err = -ENOENT; out: spin_unlock(&ts_mod_lock); return err; } EXPORT_SYMBOL(textsearch_unregister); struct ts_linear_state { unsigned int len; const void *data; }; static unsigned int get_linear_data(unsigned int consumed, const u8 **dst, struct ts_config *conf, struct ts_state *state) { struct ts_linear_state *st = (struct ts_linear_state *) state->cb; if (likely(consumed < st->len)) { *dst = st->data + consumed; return st->len - consumed; } return 0; } /** * textsearch_find_continuous - search a pattern in continuous/linear data * @conf: search configuration * @state: search state * @data: data to search in * @len: length of data * * A simplified version of textsearch_find() for continuous/linear data. * Call textsearch_next() to retrieve subsequent matches. * * Returns the position of first occurrence of the pattern or * %UINT_MAX if no occurrence was found. */ unsigned int textsearch_find_continuous(struct ts_config *conf, struct ts_state *state, const void *data, unsigned int len) { struct ts_linear_state *st = (struct ts_linear_state *) state->cb; conf->get_next_block = get_linear_data; st->data = data; st->len = len; return textsearch_find(conf, state); } EXPORT_SYMBOL(textsearch_find_continuous); /** * textsearch_prepare - Prepare a search * @algo: name of search algorithm * @pattern: pattern data * @len: length of pattern * @gfp_mask: allocation mask * @flags: search flags * * Looks up the search algorithm module and creates a new textsearch * configuration for the specified pattern. * * Note: The format of the pattern may not be compatible between * the various search algorithms. * * Returns a new textsearch configuration according to the specified * parameters or a ERR_PTR(). If a zero length pattern is passed, this * function returns EINVAL. */ struct ts_config *textsearch_prepare(const char *algo, const void *pattern, unsigned int len, gfp_t gfp_mask, int flags) { int err = -ENOENT; struct ts_config *conf; struct ts_ops *ops; if (len == 0) return ERR_PTR(-EINVAL); ops = lookup_ts_algo(algo); #ifdef CONFIG_MODULES /* * Why not always autoload you may ask. Some users are * in a situation where requesting a module may deadlock, * especially when the module is located on a NFS mount. */ if (ops == NULL && flags & TS_AUTOLOAD) { request_module("ts_%s", algo); ops = lookup_ts_algo(algo); } #endif if (ops == NULL) goto errout; conf = ops->init(pattern, len, gfp_mask, flags); if (IS_ERR(conf)) { err = PTR_ERR(conf); goto errout; } conf->ops = ops; return conf; errout: if (ops) module_put(ops->owner); return ERR_PTR(err); } EXPORT_SYMBOL(textsearch_prepare); /** * textsearch_destroy - destroy a search configuration * @conf: search configuration * * Releases all references of the configuration and frees * up the memory. */ void textsearch_destroy(struct ts_config *conf) { if (conf->ops) { if (conf->ops->destroy) conf->ops->destroy(conf); module_put(conf->ops->owner); } kfree(conf); } EXPORT_SYMBOL(textsearch_destroy); |
24 24 24 21 9 4 9 4 9 9 2 2 9 9 9 6 7 9 4 6 4 9 9 9 9 9 9 1 15 9 6 6 9 8 7 8 9 9 9 9 7 9 7 9 9 9 9 9 9 9 18 18 7 1 6 6 14 1 6 6 6 6 6 23 20 4 24 24 24 75 71 5 23 22 18 6 5 60 4 59 40 4 14 15 2 2 4 31 4 3 1 1 80 2 6 30 2 106 106 98 8 3 2 2 3 62 1 9 5 2 7 3 16 4 13 1 67 1 1 1 82 1 2 72 1 2 13 1 49 12 60 54 56 4 26 30 3 24 6 24 9 20 24 24 24 82 81 46 37 8 8 8 1 8 8 8 7 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 | /* * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/moduleparam.h> #include <linux/gfp.h> #include <net/sock.h> #include <linux/in.h> #include <linux/list.h> #include <linux/ratelimit.h> #include <linux/export.h> #include <linux/sizes.h> #include "rds.h" /* When transmitting messages in rds_send_xmit, we need to emerge from * time to time and briefly release the CPU. Otherwise the softlock watchdog * will kick our shin. * Also, it seems fairer to not let one busy connection stall all the * others. * * send_batch_count is the number of times we'll loop in send_xmit. Setting * it to 0 will restore the old behavior (where we looped until we had * drained the queue). */ static int send_batch_count = SZ_1K; module_param(send_batch_count, int, 0444); MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); static void rds_send_remove_from_sock(struct list_head *messages, int status); /* * Reset the send state. Callers must ensure that this doesn't race with * rds_send_xmit(). */ void rds_send_path_reset(struct rds_conn_path *cp) { struct rds_message *rm, *tmp; unsigned long flags; if (cp->cp_xmit_rm) { rm = cp->cp_xmit_rm; cp->cp_xmit_rm = NULL; /* Tell the user the RDMA op is no longer mapped by the * transport. This isn't entirely true (it's flushed out * independently) but as the connection is down, there's * no ongoing RDMA to/from that memory */ rds_message_unmapped(rm); rds_message_put(rm); } cp->cp_xmit_sg = 0; cp->cp_xmit_hdr_off = 0; cp->cp_xmit_data_off = 0; cp->cp_xmit_atomic_sent = 0; cp->cp_xmit_rdma_sent = 0; cp->cp_xmit_data_sent = 0; cp->cp_conn->c_map_queued = 0; cp->cp_unacked_packets = rds_sysctl_max_unacked_packets; cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes; /* Mark messages as retransmissions, and move them to the send q */ spin_lock_irqsave(&cp->cp_lock, flags); list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) { set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags); } list_splice_init(&cp->cp_retrans, &cp->cp_send_queue); spin_unlock_irqrestore(&cp->cp_lock, flags); } EXPORT_SYMBOL_GPL(rds_send_path_reset); static int acquire_in_xmit(struct rds_conn_path *cp) { return test_and_set_bit_lock(RDS_IN_XMIT, &cp->cp_flags) == 0; } static void release_in_xmit(struct rds_conn_path *cp) { clear_bit_unlock(RDS_IN_XMIT, &cp->cp_flags); /* * We don't use wait_on_bit()/wake_up_bit() because our waking is in a * hot path and finding waiters is very rare. We don't want to walk * the system-wide hashed waitqueue buckets in the fast path only to * almost never find waiters. */ if (waitqueue_active(&cp->cp_waitq)) wake_up_all(&cp->cp_waitq); } /* * We're making the conscious trade-off here to only send one message * down the connection at a time. * Pro: * - tx queueing is a simple fifo list * - reassembly is optional and easily done by transports per conn * - no per flow rx lookup at all, straight to the socket * - less per-frag memory and wire overhead * Con: * - queued acks can be delayed behind large messages * Depends: * - small message latency is higher behind queued large messages * - large message latency isn't starved by intervening small sends */ int rds_send_xmit(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; struct rds_message *rm; unsigned long flags; unsigned int tmp; struct scatterlist *sg; int ret = 0; LIST_HEAD(to_be_dropped); int batch_count; unsigned long send_gen = 0; int same_rm = 0; restart: batch_count = 0; /* * sendmsg calls here after having queued its message on the send * queue. We only have one task feeding the connection at a time. If * another thread is already feeding the queue then we back off. This * avoids blocking the caller and trading per-connection data between * caches per message. */ if (!acquire_in_xmit(cp)) { rds_stats_inc(s_send_lock_contention); ret = -ENOMEM; goto out; } if (rds_destroy_pending(cp->cp_conn)) { release_in_xmit(cp); ret = -ENETUNREACH; /* dont requeue send work */ goto out; } /* * we record the send generation after doing the xmit acquire. * if someone else manages to jump in and do some work, we'll use * this to avoid a goto restart farther down. * * The acquire_in_xmit() check above ensures that only one * caller can increment c_send_gen at any time. */ send_gen = READ_ONCE(cp->cp_send_gen) + 1; WRITE_ONCE(cp->cp_send_gen, send_gen); /* * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, * we do the opposite to avoid races. */ if (!rds_conn_path_up(cp)) { release_in_xmit(cp); ret = 0; goto out; } if (conn->c_trans->xmit_path_prepare) conn->c_trans->xmit_path_prepare(cp); /* * spin trying to push headers and data down the connection until * the connection doesn't make forward progress. */ while (1) { rm = cp->cp_xmit_rm; if (!rm) { same_rm = 0; } else { same_rm++; if (same_rm >= 4096) { rds_stats_inc(s_send_stuck_rm); ret = -EAGAIN; break; } } /* * If between sending messages, we can send a pending congestion * map update. */ if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) { rm = rds_cong_update_alloc(conn); if (IS_ERR(rm)) { ret = PTR_ERR(rm); break; } rm->data.op_active = 1; rm->m_inc.i_conn_path = cp; rm->m_inc.i_conn = cp->cp_conn; cp->cp_xmit_rm = rm; } /* * If not already working on one, grab the next message. * * cp_xmit_rm holds a ref while we're sending this message down * the connction. We can use this ref while holding the * send_sem.. rds_send_reset() is serialized with it. */ if (!rm) { unsigned int len; batch_count++; /* we want to process as big a batch as we can, but * we also want to avoid softlockups. If we've been * through a lot of messages, lets back off and see * if anyone else jumps in */ if (batch_count >= send_batch_count) goto over_batch; spin_lock_irqsave(&cp->cp_lock, flags); if (!list_empty(&cp->cp_send_queue)) { rm = list_entry(cp->cp_send_queue.next, struct rds_message, m_conn_item); rds_message_addref(rm); /* * Move the message from the send queue to the retransmit * list right away. */ list_move_tail(&rm->m_conn_item, &cp->cp_retrans); } spin_unlock_irqrestore(&cp->cp_lock, flags); if (!rm) break; /* Unfortunately, the way Infiniband deals with * RDMA to a bad MR key is by moving the entire * queue pair to error state. We could possibly * recover from that, but right now we drop the * connection. * Therefore, we never retransmit messages with RDMA ops. */ if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) || (rm->rdma.op_active && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) { spin_lock_irqsave(&cp->cp_lock, flags); if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) list_move(&rm->m_conn_item, &to_be_dropped); spin_unlock_irqrestore(&cp->cp_lock, flags); continue; } /* Require an ACK every once in a while */ len = ntohl(rm->m_inc.i_hdr.h_len); if (cp->cp_unacked_packets == 0 || cp->cp_unacked_bytes < len) { set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); cp->cp_unacked_packets = rds_sysctl_max_unacked_packets; cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes; rds_stats_inc(s_send_ack_required); } else { cp->cp_unacked_bytes -= len; cp->cp_unacked_packets--; } cp->cp_xmit_rm = rm; } /* The transport either sends the whole rdma or none of it */ if (rm->rdma.op_active && !cp->cp_xmit_rdma_sent) { rm->m_final_op = &rm->rdma; /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue */ set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); if (ret) { clear_bit(RDS_MSG_MAPPED, &rm->m_flags); wake_up_interruptible(&rm->m_flush_wait); break; } cp->cp_xmit_rdma_sent = 1; } if (rm->atomic.op_active && !cp->cp_xmit_atomic_sent) { rm->m_final_op = &rm->atomic; /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue */ set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); if (ret) { clear_bit(RDS_MSG_MAPPED, &rm->m_flags); wake_up_interruptible(&rm->m_flush_wait); break; } cp->cp_xmit_atomic_sent = 1; } /* * A number of cases require an RDS header to be sent * even if there is no data. * We permit 0-byte sends; rds-ping depends on this. * However, if there are exclusively attached silent ops, * we skip the hdr/data send, to enable silent operation. */ if (rm->data.op_nents == 0) { int ops_present; int all_ops_are_silent = 1; ops_present = (rm->atomic.op_active || rm->rdma.op_active); if (rm->atomic.op_active && !rm->atomic.op_silent) all_ops_are_silent = 0; if (rm->rdma.op_active && !rm->rdma.op_silent) all_ops_are_silent = 0; if (ops_present && all_ops_are_silent && !rm->m_rdma_cookie) rm->data.op_active = 0; } if (rm->data.op_active && !cp->cp_xmit_data_sent) { rm->m_final_op = &rm->data; ret = conn->c_trans->xmit(conn, rm, cp->cp_xmit_hdr_off, cp->cp_xmit_sg, cp->cp_xmit_data_off); if (ret <= 0) break; if (cp->cp_xmit_hdr_off < sizeof(struct rds_header)) { tmp = min_t(int, ret, sizeof(struct rds_header) - cp->cp_xmit_hdr_off); cp->cp_xmit_hdr_off += tmp; ret -= tmp; } sg = &rm->data.op_sg[cp->cp_xmit_sg]; while (ret) { tmp = min_t(int, ret, sg->length - cp->cp_xmit_data_off); cp->cp_xmit_data_off += tmp; ret -= tmp; if (cp->cp_xmit_data_off == sg->length) { cp->cp_xmit_data_off = 0; sg++; cp->cp_xmit_sg++; BUG_ON(ret != 0 && cp->cp_xmit_sg == rm->data.op_nents); } } if (cp->cp_xmit_hdr_off == sizeof(struct rds_header) && (cp->cp_xmit_sg == rm->data.op_nents)) cp->cp_xmit_data_sent = 1; } /* * A rm will only take multiple times through this loop * if there is a data op. Thus, if the data is sent (or there was * none), then we're done with the rm. */ if (!rm->data.op_active || cp->cp_xmit_data_sent) { cp->cp_xmit_rm = NULL; cp->cp_xmit_sg = 0; cp->cp_xmit_hdr_off = 0; cp->cp_xmit_data_off = 0; cp->cp_xmit_rdma_sent = 0; cp->cp_xmit_atomic_sent = 0; cp->cp_xmit_data_sent = 0; rds_message_put(rm); } } over_batch: if (conn->c_trans->xmit_path_complete) conn->c_trans->xmit_path_complete(cp); release_in_xmit(cp); /* Nuke any messages we decided not to retransmit. */ if (!list_empty(&to_be_dropped)) { /* irqs on here, so we can put(), unlike above */ list_for_each_entry(rm, &to_be_dropped, m_conn_item) rds_message_put(rm); rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); } /* * Other senders can queue a message after we last test the send queue * but before we clear RDS_IN_XMIT. In that case they'd back off and * not try and send their newly queued message. We need to check the * send queue after having cleared RDS_IN_XMIT so that their message * doesn't get stuck on the send queue. * * If the transport cannot continue (i.e ret != 0), then it must * call us when more room is available, such as from the tx * completion handler. * * We have an extra generation check here so that if someone manages * to jump in after our release_in_xmit, we'll see that they have done * some work and we will skip our goto */ if (ret == 0) { bool raced; smp_mb(); raced = send_gen != READ_ONCE(cp->cp_send_gen); if ((test_bit(0, &conn->c_map_queued) || !list_empty(&cp->cp_send_queue)) && !raced) { if (batch_count < send_batch_count) goto restart; rcu_read_lock(); if (rds_destroy_pending(cp->cp_conn)) ret = -ENETUNREACH; else queue_delayed_work(rds_wq, &cp->cp_send_w, 1); rcu_read_unlock(); } else if (raced) { rds_stats_inc(s_send_lock_queue_raced); } } out: return ret; } EXPORT_SYMBOL_GPL(rds_send_xmit); static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm) { u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len); assert_spin_locked(&rs->rs_lock); BUG_ON(rs->rs_snd_bytes < len); rs->rs_snd_bytes -= len; if (rs->rs_snd_bytes == 0) rds_stats_inc(s_send_queue_empty); } static inline int rds_send_is_acked(struct rds_message *rm, u64 ack, is_acked_func is_acked) { if (is_acked) return is_acked(rm, ack); return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack; } /* * This is pretty similar to what happens below in the ACK * handling code - except that we call here as soon as we get * the IB send completion on the RDMA op and the accompanying * message. */ void rds_rdma_send_complete(struct rds_message *rm, int status) { struct rds_sock *rs = NULL; struct rm_rdma_op *ro; struct rds_notifier *notifier; unsigned long flags; spin_lock_irqsave(&rm->m_rs_lock, flags); ro = &rm->rdma; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && ro->op_active && ro->op_notify && ro->op_notifier) { notifier = ro->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); notifier->n_status = status; spin_lock(&rs->rs_lock); list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); spin_unlock(&rs->rs_lock); ro->op_notifier = NULL; } spin_unlock_irqrestore(&rm->m_rs_lock, flags); if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } } EXPORT_SYMBOL_GPL(rds_rdma_send_complete); /* * Just like above, except looks at atomic op */ void rds_atomic_send_complete(struct rds_message *rm, int status) { struct rds_sock *rs = NULL; struct rm_atomic_op *ao; struct rds_notifier *notifier; unsigned long flags; spin_lock_irqsave(&rm->m_rs_lock, flags); ao = &rm->atomic; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && ao->op_active && ao->op_notify && ao->op_notifier) { notifier = ao->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); notifier->n_status = status; spin_lock(&rs->rs_lock); list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); spin_unlock(&rs->rs_lock); ao->op_notifier = NULL; } spin_unlock_irqrestore(&rm->m_rs_lock, flags); if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } } EXPORT_SYMBOL_GPL(rds_atomic_send_complete); /* * This is the same as rds_rdma_send_complete except we * don't do any locking - we have all the ingredients (message, * socket, socket lock) and can just move the notifier. */ static inline void __rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) { struct rm_rdma_op *ro; struct rm_atomic_op *ao; ro = &rm->rdma; if (ro->op_active && ro->op_notify && ro->op_notifier) { ro->op_notifier->n_status = status; list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue); ro->op_notifier = NULL; } ao = &rm->atomic; if (ao->op_active && ao->op_notify && ao->op_notifier) { ao->op_notifier->n_status = status; list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue); ao->op_notifier = NULL; } /* No need to wake the app - caller does this */ } /* * This removes messages from the socket's list if they're on it. The list * argument must be private to the caller, we must be able to modify it * without locks. The messages must have a reference held for their * position on the list. This function will drop that reference after * removing the messages from the 'messages' list regardless of if it found * the messages on the socket list or not. */ static void rds_send_remove_from_sock(struct list_head *messages, int status) { unsigned long flags; struct rds_sock *rs = NULL; struct rds_message *rm; while (!list_empty(messages)) { int was_on_sock = 0; rm = list_entry(messages->next, struct rds_message, m_conn_item); list_del_init(&rm->m_conn_item); /* * If we see this flag cleared then we're *sure* that someone * else beat us to removing it from the sock. If we race * with their flag update we'll get the lock and then really * see that the flag has been cleared. * * The message spinlock makes sure nobody clears rm->m_rs * while we're messing with it. It does not prevent the * message from being removed from the socket, though. */ spin_lock_irqsave(&rm->m_rs_lock, flags); if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) goto unlock_and_drop; if (rs != rm->m_rs) { if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } rs = rm->m_rs; if (rs) sock_hold(rds_rs_to_sk(rs)); } if (!rs) goto unlock_and_drop; spin_lock(&rs->rs_lock); if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { struct rm_rdma_op *ro = &rm->rdma; struct rds_notifier *notifier; list_del_init(&rm->m_sock_item); rds_send_sndbuf_remove(rs, rm); if (ro->op_active && ro->op_notifier && (ro->op_notify || (ro->op_recverr && status))) { notifier = ro->op_notifier; list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); if (!notifier->n_status) notifier->n_status = status; rm->rdma.op_notifier = NULL; } was_on_sock = 1; } spin_unlock(&rs->rs_lock); unlock_and_drop: spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); if (was_on_sock) rds_message_put(rm); } if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } } /* * Transports call here when they've determined that the receiver queued * messages up to, and including, the given sequence number. Messages are * moved to the retrans queue when rds_send_xmit picks them off the send * queue. This means that in the TCP case, the message may not have been * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked * checks the RDS_MSG_HAS_ACK_SEQ bit. */ void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack, is_acked_func is_acked) { struct rds_message *rm, *tmp; unsigned long flags; LIST_HEAD(list); spin_lock_irqsave(&cp->cp_lock, flags); list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) { if (!rds_send_is_acked(rm, ack, is_acked)) break; list_move(&rm->m_conn_item, &list); clear_bit(RDS_MSG_ON_CONN, &rm->m_flags); } /* order flag updates with spin locks */ if (!list_empty(&list)) smp_mb__after_atomic(); spin_unlock_irqrestore(&cp->cp_lock, flags); /* now remove the messages from the sock list as needed */ rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS); } EXPORT_SYMBOL_GPL(rds_send_path_drop_acked); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked) { WARN_ON(conn->c_trans->t_mp_capable); rds_send_path_drop_acked(&conn->c_path[0], ack, is_acked); } EXPORT_SYMBOL_GPL(rds_send_drop_acked); void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest) { struct rds_message *rm, *tmp; struct rds_connection *conn; struct rds_conn_path *cp; unsigned long flags; LIST_HEAD(list); /* get all the messages we're dropping under the rs lock */ spin_lock_irqsave(&rs->rs_lock, flags); list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) { if (dest && (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) || dest->sin6_port != rm->m_inc.i_hdr.h_dport)) continue; list_move(&rm->m_sock_item, &list); rds_send_sndbuf_remove(rs, rm); clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); } /* order flag updates with the rs lock */ smp_mb__after_atomic(); spin_unlock_irqrestore(&rs->rs_lock, flags); if (list_empty(&list)) return; /* Remove the messages from the conn */ list_for_each_entry(rm, &list, m_sock_item) { conn = rm->m_inc.i_conn; if (conn->c_trans->t_mp_capable) cp = rm->m_inc.i_conn_path; else cp = &conn->c_path[0]; spin_lock_irqsave(&cp->cp_lock, flags); /* * Maybe someone else beat us to removing rm from the conn. * If we race with their flag update we'll get the lock and * then really see that the flag has been cleared. */ if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { spin_unlock_irqrestore(&cp->cp_lock, flags); continue; } list_del_init(&rm->m_conn_item); spin_unlock_irqrestore(&cp->cp_lock, flags); /* * Couldn't grab m_rs_lock in top loop (lock ordering), * but we can now. */ spin_lock_irqsave(&rm->m_rs_lock, flags); spin_lock(&rs->rs_lock); __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); spin_unlock(&rs->rs_lock); spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); } rds_wake_sk_sleep(rs); while (!list_empty(&list)) { rm = list_entry(list.next, struct rds_message, m_sock_item); list_del_init(&rm->m_sock_item); rds_message_wait(rm); /* just in case the code above skipped this message * because RDS_MSG_ON_CONN wasn't set, run it again here * taking m_rs_lock is the only thing that keeps us * from racing with ack processing. */ spin_lock_irqsave(&rm->m_rs_lock, flags); spin_lock(&rs->rs_lock); __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); spin_unlock(&rs->rs_lock); spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); } } /* * we only want this to fire once so we use the callers 'queued'. It's * possible that another thread can race with us and remove the * message from the flow with RDS_CANCEL_SENT_TO. */ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, struct rds_conn_path *cp, struct rds_message *rm, __be16 sport, __be16 dport, int *queued) { unsigned long flags; u32 len; if (*queued) goto out; len = be32_to_cpu(rm->m_inc.i_hdr.h_len); /* this is the only place which holds both the socket's rs_lock * and the connection's c_lock */ spin_lock_irqsave(&rs->rs_lock, flags); /* * If there is a little space in sndbuf, we don't queue anything, * and userspace gets -EAGAIN. But poll() indicates there's send * room. This can lead to bad behavior (spinning) if snd_bytes isn't * freed up by incoming acks. So we check the *old* value of * rs_snd_bytes here to allow the last msg to exceed the buffer, * and poll() now knows no more data can be sent. */ if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) { rs->rs_snd_bytes += len; /* let recv side know we are close to send space exhaustion. * This is probably not the optimal way to do it, as this * means we set the flag on *all* messages as soon as our * throughput hits a certain threshold. */ if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2) set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); list_add_tail(&rm->m_sock_item, &rs->rs_send_queue); set_bit(RDS_MSG_ON_SOCK, &rm->m_flags); rds_message_addref(rm); sock_hold(rds_rs_to_sk(rs)); rm->m_rs = rs; /* The code ordering is a little weird, but we're trying to minimize the time we hold c_lock */ rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0); rm->m_inc.i_conn = conn; rm->m_inc.i_conn_path = cp; rds_message_addref(rm); spin_lock(&cp->cp_lock); rm->m_inc.i_hdr.h_sequence = cpu_to_be64(cp->cp_next_tx_seq++); list_add_tail(&rm->m_conn_item, &cp->cp_send_queue); set_bit(RDS_MSG_ON_CONN, &rm->m_flags); spin_unlock(&cp->cp_lock); rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n", rm, len, rs, rs->rs_snd_bytes, (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence)); *queued = 1; } spin_unlock_irqrestore(&rs->rs_lock, flags); out: return *queued; } /* * rds_message is getting to be quite complicated, and we'd like to allocate * it all in one go. This figures out how big it needs to be up front. */ static int rds_rm_size(struct msghdr *msg, int num_sgs, struct rds_iov_vector_arr *vct) { struct cmsghdr *cmsg; int size = 0; int cmsg_groups = 0; int retval; bool zcopy_cookie = false; struct rds_iov_vector *iov, *tmp_iov; if (num_sgs < 0) return -EINVAL; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_RDS) continue; switch (cmsg->cmsg_type) { case RDS_CMSG_RDMA_ARGS: if (vct->indx >= vct->len) { vct->len += vct->incr; tmp_iov = krealloc(vct->vec, vct->len * sizeof(struct rds_iov_vector), GFP_KERNEL); if (!tmp_iov) { vct->len -= vct->incr; return -ENOMEM; } vct->vec = tmp_iov; } iov = &vct->vec[vct->indx]; memset(iov, 0, sizeof(struct rds_iov_vector)); vct->indx++; cmsg_groups |= 1; retval = rds_rdma_extra_size(CMSG_DATA(cmsg), iov); if (retval < 0) return retval; size += retval; break; case RDS_CMSG_ZCOPY_COOKIE: zcopy_cookie = true; fallthrough; case RDS_CMSG_RDMA_DEST: case RDS_CMSG_RDMA_MAP: cmsg_groups |= 2; /* these are valid but do no add any size */ break; case RDS_CMSG_ATOMIC_CSWP: case RDS_CMSG_ATOMIC_FADD: case RDS_CMSG_MASKED_ATOMIC_CSWP: case RDS_CMSG_MASKED_ATOMIC_FADD: cmsg_groups |= 1; size += sizeof(struct scatterlist); break; default: return -EINVAL; } } if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie) return -EINVAL; size += num_sgs * sizeof(struct scatterlist); /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ if (cmsg_groups == 3) return -EINVAL; return size; } static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { u32 *cookie; if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)) || !rm->data.op_mmp_znotifier) return -EINVAL; cookie = CMSG_DATA(cmsg); rm->data.op_mmp_znotifier->z_cookie = *cookie; return 0; } static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, struct msghdr *msg, int *allocated_mr, struct rds_iov_vector_arr *vct) { struct cmsghdr *cmsg; int ret = 0, ind = 0; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_RDS) continue; /* As a side effect, RDMA_DEST and RDMA_MAP will set * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr. */ switch (cmsg->cmsg_type) { case RDS_CMSG_RDMA_ARGS: if (ind >= vct->indx) return -ENOMEM; ret = rds_cmsg_rdma_args(rs, rm, cmsg, &vct->vec[ind]); ind++; break; case RDS_CMSG_RDMA_DEST: ret = rds_cmsg_rdma_dest(rs, rm, cmsg); break; case RDS_CMSG_RDMA_MAP: ret = rds_cmsg_rdma_map(rs, rm, cmsg); if (!ret) *allocated_mr = 1; else if (ret == -ENODEV) /* Accommodate the get_mr() case which can fail * if connection isn't established yet. */ ret = -EAGAIN; break; case RDS_CMSG_ATOMIC_CSWP: case RDS_CMSG_ATOMIC_FADD: case RDS_CMSG_MASKED_ATOMIC_CSWP: case RDS_CMSG_MASKED_ATOMIC_FADD: ret = rds_cmsg_atomic(rs, rm, cmsg); break; case RDS_CMSG_ZCOPY_COOKIE: ret = rds_cmsg_zcopy(rs, rm, cmsg); break; default: return -EINVAL; } if (ret) break; } return ret; } static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn, int nonblock) { int hash; if (conn->c_npaths == 0) hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS); else hash = RDS_MPATH_HASH(rs, conn->c_npaths); if (conn->c_npaths == 0 && hash != 0) { rds_send_ping(conn, 0); /* The underlying connection is not up yet. Need to wait * until it is up to be sure that the non-zero c_path can be * used. But if we are interrupted, we have to use the zero * c_path in case the connection ends up being non-MP capable. */ if (conn->c_npaths == 0) { /* Cannot wait for the connection be made, so just use * the base c_path. */ if (nonblock) return 0; if (wait_event_interruptible(conn->c_hs_waitq, conn->c_npaths != 0)) hash = 0; } if (conn->c_npaths == 1) hash = 0; } return hash; } static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes) { struct rds_rdma_args *args; struct cmsghdr *cmsg; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_RDS) continue; if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) { if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))) return -EINVAL; args = CMSG_DATA(cmsg); *rdma_bytes += args->remote_vec.bytes; } } return 0; } int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); __be16 dport; struct rds_message *rm = NULL; struct rds_connection *conn; int ret = 0; int queued = 0, allocated_mr = 0; int nonblock = msg->msg_flags & MSG_DONTWAIT; long timeo = sock_sndtimeo(sk, nonblock); struct rds_conn_path *cpath; struct in6_addr daddr; __u32 scope_id = 0; size_t rdma_payload_len = 0; bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); int num_sgs = DIV_ROUND_UP(payload_len, PAGE_SIZE); int namelen; struct rds_iov_vector_arr vct; int ind; memset(&vct, 0, sizeof(vct)); /* expect 1 RDMA CMSG per rds_sendmsg. can still grow if more needed. */ vct.incr = 1; /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT | MSG_ZEROCOPY)) { ret = -EOPNOTSUPP; goto out; } namelen = msg->msg_namelen; if (namelen != 0) { if (namelen < sizeof(*usin)) { ret = -EINVAL; goto out; } switch (usin->sin_family) { case AF_INET: if (usin->sin_addr.s_addr == htonl(INADDR_ANY) || usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || ipv4_is_multicast(usin->sin_addr.s_addr)) { ret = -EINVAL; goto out; } ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr); dport = usin->sin_port; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { int addr_type; if (namelen < sizeof(*sin6)) { ret = -EINVAL; goto out; } addr_type = ipv6_addr_type(&sin6->sin6_addr); if (!(addr_type & IPV6_ADDR_UNICAST)) { __be32 addr4; if (!(addr_type & IPV6_ADDR_MAPPED)) { ret = -EINVAL; goto out; } /* It is a mapped address. Need to do some * sanity checks. */ addr4 = sin6->sin6_addr.s6_addr32[3]; if (addr4 == htonl(INADDR_ANY) || addr4 == htonl(INADDR_BROADCAST) || ipv4_is_multicast(addr4)) { ret = -EINVAL; goto out; } } if (addr_type & IPV6_ADDR_LINKLOCAL) { if (sin6->sin6_scope_id == 0) { ret = -EINVAL; goto out; } scope_id = sin6->sin6_scope_id; } daddr = sin6->sin6_addr; dport = sin6->sin6_port; break; } #endif default: ret = -EINVAL; goto out; } } else { /* We only care about consistency with ->connect() */ lock_sock(sk); daddr = rs->rs_conn_addr; dport = rs->rs_conn_port; scope_id = rs->rs_bound_scope_id; release_sock(sk); } lock_sock(sk); if (ipv6_addr_any(&rs->rs_bound_addr) || ipv6_addr_any(&daddr)) { release_sock(sk); ret = -ENOTCONN; goto out; } else if (namelen != 0) { /* Cannot send to an IPv4 address using an IPv6 source * address and cannot send to an IPv6 address using an * IPv4 source address. */ if (ipv6_addr_v4mapped(&daddr) ^ ipv6_addr_v4mapped(&rs->rs_bound_addr)) { release_sock(sk); ret = -EOPNOTSUPP; goto out; } /* If the socket is already bound to a link local address, * it can only send to peers on the same link. But allow * communicating between link local and non-link local address. */ if (scope_id != rs->rs_bound_scope_id) { if (!scope_id) { scope_id = rs->rs_bound_scope_id; } else if (rs->rs_bound_scope_id) { release_sock(sk); ret = -EINVAL; goto out; } } } release_sock(sk); ret = rds_rdma_bytes(msg, &rdma_payload_len); if (ret) goto out; if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) { ret = -EMSGSIZE; goto out; } if (payload_len > rds_sk_sndbuf(rs)) { ret = -EMSGSIZE; goto out; } if (zcopy) { if (rs->rs_transport->t_type != RDS_TRANS_TCP) { ret = -EOPNOTSUPP; goto out; } num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX); } /* size of rm including all sgs */ ret = rds_rm_size(msg, num_sgs, &vct); if (ret < 0) goto out; rm = rds_message_alloc(ret, GFP_KERNEL); if (!rm) { ret = -ENOMEM; goto out; } /* Attach data to the rm */ if (payload_len) { rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); if (IS_ERR(rm->data.op_sg)) { ret = PTR_ERR(rm->data.op_sg); goto out; } ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy); if (ret) goto out; } rm->data.op_active = 1; rm->m_daddr = daddr; /* rds_conn_create has a spinlock that runs with IRQ off. * Caching the conn in the socket helps a lot. */ if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr) && rs->rs_tos == rs->rs_conn->c_tos) { conn = rs->rs_conn; } else { conn = rds_conn_create_outgoing(sock_net(sock->sk), &rs->rs_bound_addr, &daddr, rs->rs_transport, rs->rs_tos, sock->sk->sk_allocation, scope_id); if (IS_ERR(conn)) { ret = PTR_ERR(conn); goto out; } rs->rs_conn = conn; } if (conn->c_trans->t_mp_capable) cpath = &conn->c_path[rds_send_mprds_hash(rs, conn, nonblock)]; else cpath = &conn->c_path[0]; rm->m_conn_path = cpath; /* Parse any control messages the user may have included. */ ret = rds_cmsg_send(rs, rm, msg, &allocated_mr, &vct); if (ret) goto out; if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", &rm->rdma, conn->c_trans->xmit_rdma); ret = -EOPNOTSUPP; goto out; } if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) { printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n", &rm->atomic, conn->c_trans->xmit_atomic); ret = -EOPNOTSUPP; goto out; } if (rds_destroy_pending(conn)) { ret = -EAGAIN; goto out; } if (rds_conn_path_down(cpath)) rds_check_all_paths(conn); ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); if (ret) { rs->rs_seen_congestion = 1; goto out; } while (!rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port, dport, &queued)) { rds_stats_inc(s_send_queue_full); if (nonblock) { ret = -EAGAIN; goto out; } timeo = wait_event_interruptible_timeout(*sk_sleep(sk), rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port, dport, &queued), timeo); rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo); if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) continue; ret = timeo; if (ret == 0) ret = -ETIMEDOUT; goto out; } /* * By now we've committed to the send. We reuse rds_send_worker() * to retry sends in the rds thread if the transport asks us to. */ rds_stats_inc(s_send_queued); ret = rds_send_xmit(cpath); if (ret == -ENOMEM || ret == -EAGAIN) { ret = 0; rcu_read_lock(); if (rds_destroy_pending(cpath->cp_conn)) ret = -ENETUNREACH; else queue_delayed_work(rds_wq, &cpath->cp_send_w, 1); rcu_read_unlock(); } if (ret) goto out; rds_message_put(rm); for (ind = 0; ind < vct.indx; ind++) kfree(vct.vec[ind].iov); kfree(vct.vec); return payload_len; out: for (ind = 0; ind < vct.indx; ind++) kfree(vct.vec[ind].iov); kfree(vct.vec); /* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly. * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN * or in any other way, we need to destroy the MR again */ if (allocated_mr) rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1); if (rm) rds_message_put(rm); return ret; } /* * send out a probe. Can be shared by rds_send_ping, * rds_send_pong, rds_send_hb. * rds_send_hb should use h_flags * RDS_FLAG_HB_PING|RDS_FLAG_ACK_REQUIRED * or * RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED */ static int rds_send_probe(struct rds_conn_path *cp, __be16 sport, __be16 dport, u8 h_flags) { struct rds_message *rm; unsigned long flags; int ret = 0; rm = rds_message_alloc(0, GFP_ATOMIC); if (!rm) { ret = -ENOMEM; goto out; } rm->m_daddr = cp->cp_conn->c_faddr; rm->data.op_active = 1; rds_conn_path_connect_if_down(cp); ret = rds_cong_wait(cp->cp_conn->c_fcong, dport, 1, NULL); if (ret) goto out; spin_lock_irqsave(&cp->cp_lock, flags); list_add_tail(&rm->m_conn_item, &cp->cp_send_queue); set_bit(RDS_MSG_ON_CONN, &rm->m_flags); rds_message_addref(rm); rm->m_inc.i_conn = cp->cp_conn; rm->m_inc.i_conn_path = cp; rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, cp->cp_next_tx_seq); rm->m_inc.i_hdr.h_flags |= h_flags; cp->cp_next_tx_seq++; if (RDS_HS_PROBE(be16_to_cpu(sport), be16_to_cpu(dport)) && cp->cp_conn->c_trans->t_mp_capable) { u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS); u32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_NPATHS, &npaths, sizeof(npaths)); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_GEN_NUM, &my_gen_num, sizeof(u32)); } spin_unlock_irqrestore(&cp->cp_lock, flags); rds_stats_inc(s_send_queued); rds_stats_inc(s_send_pong); /* schedule the send work on rds_wq */ rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) queue_delayed_work(rds_wq, &cp->cp_send_w, 1); rcu_read_unlock(); rds_message_put(rm); return 0; out: if (rm) rds_message_put(rm); return ret; } int rds_send_pong(struct rds_conn_path *cp, __be16 dport) { return rds_send_probe(cp, 0, dport, 0); } void rds_send_ping(struct rds_connection *conn, int cp_index) { unsigned long flags; struct rds_conn_path *cp = &conn->c_path[cp_index]; spin_lock_irqsave(&cp->cp_lock, flags); if (conn->c_ping_triggered) { spin_unlock_irqrestore(&cp->cp_lock, flags); return; } conn->c_ping_triggered = 1; spin_unlock_irqrestore(&cp->cp_lock, flags); rds_send_probe(cp, cpu_to_be16(RDS_FLAG_PROBE_PORT), 0, 0); } EXPORT_SYMBOL_GPL(rds_send_ping); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Key-agreement Protocol Primitives (KPP) * * Copyright (c) 2016, Intel Corporation * Authors: Salvatore Benedetto <salvatore.benedetto@intel.com> */ #ifndef _CRYPTO_KPP_ #define _CRYPTO_KPP_ #include <linux/atomic.h> #include <linux/container_of.h> #include <linux/crypto.h> #include <linux/slab.h> /** * struct kpp_request * * @base: Common attributes for async crypto requests * @src: Source data * @dst: Destination data * @src_len: Size of the input buffer * @dst_len: Size of the output buffer. It needs to be at least * as big as the expected result depending on the operation * After operation it will be updated with the actual size of the * result. In case of error where the dst sgl size was insufficient, * it will be updated to the size required for the operation. * @__ctx: Start of private context data */ struct kpp_request { struct crypto_async_request base; struct scatterlist *src; struct scatterlist *dst; unsigned int src_len; unsigned int dst_len; void *__ctx[] CRYPTO_MINALIGN_ATTR; }; /** * struct crypto_kpp - user-instantiated object which encapsulate * algorithms and core processing logic * * @reqsize: Request context size required by algorithm * implementation * @base: Common crypto API algorithm data structure */ struct crypto_kpp { unsigned int reqsize; struct crypto_tfm base; }; /** * struct kpp_alg - generic key-agreement protocol primitives * * @set_secret: Function invokes the protocol specific function to * store the secret private key along with parameters. * The implementation knows how to decode the buffer * @generate_public_key: Function generate the public key to be sent to the * counterpart. In case of error, where output is not big * enough req->dst_len will be updated to the size * required * @compute_shared_secret: Function compute the shared secret as defined by * the algorithm. The result is given back to the user. * In case of error, where output is not big enough, * req->dst_len will be updated to the size required * @max_size: Function returns the size of the output buffer * @init: Initialize the object. This is called only once at * instantiation time. In case the cryptographic hardware * needs to be initialized. Software fallback should be * put in place here. * @exit: Undo everything @init did. * * @base: Common crypto API algorithm data structure */ struct kpp_alg { int (*set_secret)(struct crypto_kpp *tfm, const void *buffer, unsigned int len); int (*generate_public_key)(struct kpp_request *req); int (*compute_shared_secret)(struct kpp_request *req); unsigned int (*max_size)(struct crypto_kpp *tfm); int (*init)(struct crypto_kpp *tfm); void (*exit)(struct crypto_kpp *tfm); struct crypto_alg base; }; /** * DOC: Generic Key-agreement Protocol Primitives API * * The KPP API is used with the algorithm type * CRYPTO_ALG_TYPE_KPP (listed as type "kpp" in /proc/crypto) */ /** * crypto_alloc_kpp() - allocate KPP tfm handle * @alg_name: is the name of the kpp algorithm (e.g. "dh", "ecdh") * @type: specifies the type of the algorithm * @mask: specifies the mask for the algorithm * * Allocate a handle for kpp algorithm. The returned struct crypto_kpp * is required for any following API invocation * * Return: allocated handle in case of success; IS_ERR() is true in case of * an error, PTR_ERR() returns the error code. */ struct crypto_kpp *crypto_alloc_kpp(const char *alg_name, u32 type, u32 mask); int crypto_has_kpp(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_kpp_tfm(struct crypto_kpp *tfm) { return &tfm->base; } static inline struct kpp_alg *__crypto_kpp_alg(struct crypto_alg *alg) { return container_of(alg, struct kpp_alg, base); } static inline struct crypto_kpp *__crypto_kpp_tfm(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_kpp, base); } static inline struct kpp_alg *crypto_kpp_alg(struct crypto_kpp *tfm) { return __crypto_kpp_alg(crypto_kpp_tfm(tfm)->__crt_alg); } static inline unsigned int crypto_kpp_reqsize(struct crypto_kpp *tfm) { return tfm->reqsize; } static inline void kpp_request_set_tfm(struct kpp_request *req, struct crypto_kpp *tfm) { req->base.tfm = crypto_kpp_tfm(tfm); } static inline struct crypto_kpp *crypto_kpp_reqtfm(struct kpp_request *req) { return __crypto_kpp_tfm(req->base.tfm); } static inline u32 crypto_kpp_get_flags(struct crypto_kpp *tfm) { return crypto_tfm_get_flags(crypto_kpp_tfm(tfm)); } static inline void crypto_kpp_set_flags(struct crypto_kpp *tfm, u32 flags) { crypto_tfm_set_flags(crypto_kpp_tfm(tfm), flags); } /** * crypto_free_kpp() - free KPP tfm handle * * @tfm: KPP tfm handle allocated with crypto_alloc_kpp() * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_kpp(struct crypto_kpp *tfm) { crypto_destroy_tfm(tfm, crypto_kpp_tfm(tfm)); } /** * kpp_request_alloc() - allocates kpp request * * @tfm: KPP tfm handle allocated with crypto_alloc_kpp() * @gfp: allocation flags * * Return: allocated handle in case of success or NULL in case of an error. */ static inline struct kpp_request *kpp_request_alloc(struct crypto_kpp *tfm, gfp_t gfp) { struct kpp_request *req; req = kmalloc(sizeof(*req) + crypto_kpp_reqsize(tfm), gfp); if (likely(req)) kpp_request_set_tfm(req, tfm); return req; } /** * kpp_request_free() - zeroize and free kpp request * * @req: request to free */ static inline void kpp_request_free(struct kpp_request *req) { kfree_sensitive(req); } /** * kpp_request_set_callback() - Sets an asynchronous callback. * * Callback will be called when an asynchronous operation on a given * request is finished. * * @req: request that the callback will be set for * @flgs: specify for instance if the operation may backlog * @cmpl: callback which will be called * @data: private data used by the caller */ static inline void kpp_request_set_callback(struct kpp_request *req, u32 flgs, crypto_completion_t cmpl, void *data) { req->base.complete = cmpl; req->base.data = data; req->base.flags = flgs; } /** * kpp_request_set_input() - Sets input buffer * * Sets parameters required by generate_public_key * * @req: kpp request * @input: ptr to input scatter list * @input_len: size of the input scatter list */ static inline void kpp_request_set_input(struct kpp_request *req, struct scatterlist *input, unsigned int input_len) { req->src = input; req->src_len = input_len; } /** * kpp_request_set_output() - Sets output buffer * * Sets parameters required by kpp operation * * @req: kpp request * @output: ptr to output scatter list * @output_len: size of the output scatter list */ static inline void kpp_request_set_output(struct kpp_request *req, struct scatterlist *output, unsigned int output_len) { req->dst = output; req->dst_len = output_len; } enum { CRYPTO_KPP_SECRET_TYPE_UNKNOWN, CRYPTO_KPP_SECRET_TYPE_DH, CRYPTO_KPP_SECRET_TYPE_ECDH, }; /** * struct kpp_secret - small header for packing secret buffer * * @type: define type of secret. Each kpp type will define its own * @len: specify the len of the secret, include the header, that * follows the struct */ struct kpp_secret { unsigned short type; unsigned short len; }; /** * crypto_kpp_set_secret() - Invoke kpp operation * * Function invokes the specific kpp operation for a given alg. * * @tfm: tfm handle * @buffer: Buffer holding the packet representation of the private * key. The structure of the packet key depends on the particular * KPP implementation. Packing and unpacking helpers are provided * for ECDH and DH (see the respective header files for those * implementations). * @len: Length of the packet private key buffer. * * Return: zero on success; error code in case of error */ static inline int crypto_kpp_set_secret(struct crypto_kpp *tfm, const void *buffer, unsigned int len) { return crypto_kpp_alg(tfm)->set_secret(tfm, buffer, len); } /** * crypto_kpp_generate_public_key() - Invoke kpp operation * * Function invokes the specific kpp operation for generating the public part * for a given kpp algorithm. * * To generate a private key, the caller should use a random number generator. * The output of the requested length serves as the private key. * * @req: kpp key request * * Return: zero on success; error code in case of error */ static inline int crypto_kpp_generate_public_key(struct kpp_request *req) { struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); return crypto_kpp_alg(tfm)->generate_public_key(req); } /** * crypto_kpp_compute_shared_secret() - Invoke kpp operation * * Function invokes the specific kpp operation for computing the shared secret * for a given kpp algorithm. * * @req: kpp key request * * Return: zero on success; error code in case of error */ static inline int crypto_kpp_compute_shared_secret(struct kpp_request *req) { struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); return crypto_kpp_alg(tfm)->compute_shared_secret(req); } /** * crypto_kpp_maxsize() - Get len for output buffer * * Function returns the output buffer size required for a given key. * Function assumes that the key is already set in the transformation. If this * function is called without a setkey or with a failed setkey, you will end up * in a NULL dereference. * * @tfm: KPP tfm handle allocated with crypto_alloc_kpp() */ static inline unsigned int crypto_kpp_maxsize(struct crypto_kpp *tfm) { struct kpp_alg *alg = crypto_kpp_alg(tfm); return alg->max_size(tfm); } #endif |
7 26 31 36 1 35 33 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 | // SPDX-License-Identifier: GPL-2.0-or-later /* Key garbage collector * * Copyright (C) 2009-2011 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/slab.h> #include <linux/security.h> #include <keys/keyring-type.h> #include "internal.h" /* * Delay between key revocation/expiry in seconds */ unsigned key_gc_delay = 5 * 60; /* * Reaper for unused keys. */ static void key_garbage_collector(struct work_struct *work); DECLARE_WORK(key_gc_work, key_garbage_collector); /* * Reaper for links from keyrings to dead keys. */ static void key_gc_timer_func(struct timer_list *); static DEFINE_TIMER(key_gc_timer, key_gc_timer_func); static time64_t key_gc_next_run = TIME64_MAX; static struct key_type *key_gc_dead_keytype; static unsigned long key_gc_flags; #define KEY_GC_KEY_EXPIRED 0 /* A key expired and needs unlinking */ #define KEY_GC_REAP_KEYTYPE 1 /* A keytype is being unregistered */ #define KEY_GC_REAPING_KEYTYPE 2 /* Cleared when keytype reaped */ /* * Any key whose type gets unregistered will be re-typed to this if it can't be * immediately unlinked. */ struct key_type key_type_dead = { .name = ".dead", }; /* * Schedule a garbage collection run. * - time precision isn't particularly important */ void key_schedule_gc(time64_t gc_at) { unsigned long expires; time64_t now = ktime_get_real_seconds(); kenter("%lld", gc_at - now); if (gc_at <= now || test_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags)) { kdebug("IMMEDIATE"); schedule_work(&key_gc_work); } else if (gc_at < key_gc_next_run) { kdebug("DEFERRED"); key_gc_next_run = gc_at; expires = jiffies + (gc_at - now) * HZ; mod_timer(&key_gc_timer, expires); } } /* * Set the expiration time on a key. */ void key_set_expiry(struct key *key, time64_t expiry) { key->expiry = expiry; if (expiry != TIME64_MAX) { if (!(key->type->flags & KEY_TYPE_INSTANT_REAP)) expiry += key_gc_delay; key_schedule_gc(expiry); } } /* * Schedule a dead links collection run. */ void key_schedule_gc_links(void) { set_bit(KEY_GC_KEY_EXPIRED, &key_gc_flags); schedule_work(&key_gc_work); } /* * Some key's cleanup time was met after it expired, so we need to get the * reaper to go through a cycle finding expired keys. */ static void key_gc_timer_func(struct timer_list *unused) { kenter(""); key_gc_next_run = TIME64_MAX; key_schedule_gc_links(); } /* * Reap keys of dead type. * * We use three flags to make sure we see three complete cycles of the garbage * collector: the first to mark keys of that type as being dead, the second to * collect dead links and the third to clean up the dead keys. We have to be * careful as there may already be a cycle in progress. * * The caller must be holding key_types_sem. */ void key_gc_keytype(struct key_type *ktype) { kenter("%s", ktype->name); key_gc_dead_keytype = ktype; set_bit(KEY_GC_REAPING_KEYTYPE, &key_gc_flags); smp_mb(); set_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags); kdebug("schedule"); schedule_work(&key_gc_work); kdebug("sleep"); wait_on_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE, TASK_UNINTERRUPTIBLE); key_gc_dead_keytype = NULL; kleave(""); } /* * Garbage collect a list of unreferenced, detached keys */ static noinline void key_gc_unused_keys(struct list_head *keys) { while (!list_empty(keys)) { struct key *key = list_entry(keys->next, struct key, graveyard_link); short state = key->state; list_del(&key->graveyard_link); kdebug("- %u", key->serial); key_check(key); #ifdef CONFIG_KEY_NOTIFICATIONS remove_watch_list(key->watchers, key->serial); key->watchers = NULL; #endif /* Throw away the key data if the key is instantiated */ if (state == KEY_IS_POSITIVE && key->type->destroy) key->type->destroy(key); security_key_free(key); atomic_dec(&key->user->nkeys); if (state != KEY_IS_UNINSTANTIATED) atomic_dec(&key->user->nikeys); key_user_put(key->user); key_put_tag(key->domain_tag); kfree(key->description); memzero_explicit(key, sizeof(*key)); kmem_cache_free(key_jar, key); } } /* * Garbage collector for unused keys. * * This is done in process context so that we don't have to disable interrupts * all over the place. key_put() schedules this rather than trying to do the * cleanup itself, which means key_put() doesn't have to sleep. */ static void key_garbage_collector(struct work_struct *work) { static LIST_HEAD(graveyard); static u8 gc_state; /* Internal persistent state */ #define KEY_GC_REAP_AGAIN 0x01 /* - Need another cycle */ #define KEY_GC_REAPING_LINKS 0x02 /* - We need to reap links */ #define KEY_GC_REAPING_DEAD_1 0x10 /* - We need to mark dead keys */ #define KEY_GC_REAPING_DEAD_2 0x20 /* - We need to reap dead key links */ #define KEY_GC_REAPING_DEAD_3 0x40 /* - We need to reap dead keys */ #define KEY_GC_FOUND_DEAD_KEY 0x80 /* - We found at least one dead key */ struct rb_node *cursor; struct key *key; time64_t new_timer, limit, expiry; kenter("[%lx,%x]", key_gc_flags, gc_state); limit = ktime_get_real_seconds(); /* Work out what we're going to be doing in this pass */ gc_state &= KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2; gc_state <<= 1; if (test_and_clear_bit(KEY_GC_KEY_EXPIRED, &key_gc_flags)) gc_state |= KEY_GC_REAPING_LINKS; if (test_and_clear_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags)) gc_state |= KEY_GC_REAPING_DEAD_1; kdebug("new pass %x", gc_state); new_timer = TIME64_MAX; /* As only this function is permitted to remove things from the key * serial tree, if cursor is non-NULL then it will always point to a * valid node in the tree - even if lock got dropped. */ spin_lock(&key_serial_lock); cursor = rb_first(&key_serial_tree); continue_scanning: while (cursor) { key = rb_entry(cursor, struct key, serial_node); cursor = rb_next(cursor); if (refcount_read(&key->usage) == 0) goto found_unreferenced_key; if (unlikely(gc_state & KEY_GC_REAPING_DEAD_1)) { if (key->type == key_gc_dead_keytype) { gc_state |= KEY_GC_FOUND_DEAD_KEY; set_bit(KEY_FLAG_DEAD, &key->flags); key->perm = 0; goto skip_dead_key; } else if (key->type == &key_type_keyring && key->restrict_link) { goto found_restricted_keyring; } } expiry = key->expiry; if (expiry != TIME64_MAX) { if (!(key->type->flags & KEY_TYPE_INSTANT_REAP)) expiry += key_gc_delay; if (expiry > limit && expiry < new_timer) { kdebug("will expire %x in %lld", key_serial(key), key->expiry - limit); new_timer = key->expiry; } } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_2)) if (key->type == key_gc_dead_keytype) gc_state |= KEY_GC_FOUND_DEAD_KEY; if ((gc_state & KEY_GC_REAPING_LINKS) || unlikely(gc_state & KEY_GC_REAPING_DEAD_2)) { if (key->type == &key_type_keyring) goto found_keyring; } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_3)) if (key->type == key_gc_dead_keytype) goto destroy_dead_key; skip_dead_key: if (spin_is_contended(&key_serial_lock) || need_resched()) goto contended; } contended: spin_unlock(&key_serial_lock); maybe_resched: if (cursor) { cond_resched(); spin_lock(&key_serial_lock); goto continue_scanning; } /* We've completed the pass. Set the timer if we need to and queue a * new cycle if necessary. We keep executing cycles until we find one * where we didn't reap any keys. */ kdebug("pass complete"); if (new_timer != TIME64_MAX) { new_timer += key_gc_delay; key_schedule_gc(new_timer); } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_2) || !list_empty(&graveyard)) { /* Make sure that all pending keyring payload destructions are * fulfilled and that people aren't now looking at dead or * dying keys that they don't have a reference upon or a link * to. */ kdebug("gc sync"); synchronize_rcu(); } if (!list_empty(&graveyard)) { kdebug("gc keys"); key_gc_unused_keys(&graveyard); } if (unlikely(gc_state & (KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2))) { if (!(gc_state & KEY_GC_FOUND_DEAD_KEY)) { /* No remaining dead keys: short circuit the remaining * keytype reap cycles. */ kdebug("dead short"); gc_state &= ~(KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2); gc_state |= KEY_GC_REAPING_DEAD_3; } else { gc_state |= KEY_GC_REAP_AGAIN; } } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_3)) { kdebug("dead wake"); smp_mb(); clear_bit(KEY_GC_REAPING_KEYTYPE, &key_gc_flags); wake_up_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE); } if (gc_state & KEY_GC_REAP_AGAIN) schedule_work(&key_gc_work); kleave(" [end %x]", gc_state); return; /* We found an unreferenced key - once we've removed it from the tree, * we can safely drop the lock. */ found_unreferenced_key: kdebug("unrefd key %d", key->serial); rb_erase(&key->serial_node, &key_serial_tree); spin_unlock(&key_serial_lock); list_add_tail(&key->graveyard_link, &graveyard); gc_state |= KEY_GC_REAP_AGAIN; goto maybe_resched; /* We found a restricted keyring and need to update the restriction if * it is associated with the dead key type. */ found_restricted_keyring: spin_unlock(&key_serial_lock); keyring_restriction_gc(key, key_gc_dead_keytype); goto maybe_resched; /* We found a keyring and we need to check the payload for links to * dead or expired keys. We don't flag another reap immediately as we * have to wait for the old payload to be destroyed by RCU before we * can reap the keys to which it refers. */ found_keyring: spin_unlock(&key_serial_lock); keyring_gc(key, limit); goto maybe_resched; /* We found a dead key that is still referenced. Reset its type and * destroy its payload with its semaphore held. */ destroy_dead_key: spin_unlock(&key_serial_lock); kdebug("destroy key %d", key->serial); down_write(&key->sem); key->type = &key_type_dead; if (key_gc_dead_keytype->destroy) key_gc_dead_keytype->destroy(key); memset(&key->payload, KEY_DESTROY, sizeof(key->payload)); up_write(&key->sem); goto maybe_resched; } |
102 4 102 101 15 15 162 162 102 102 23 93 93 157 4 153 167 2 179 3 152 151 5 5 61 1 57 3 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 | /* * PCM Interface - misc routines * Copyright (c) 1998 by Jaroslav Kysela <perex@perex.cz> * * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Library General Public License as * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #include <linux/time.h> #include <linux/export.h> #include <sound/core.h> #include <sound/pcm.h> #include "pcm_local.h" #define SND_PCM_FORMAT_UNKNOWN (-1) /* NOTE: "signed" prefix must be given below since the default char is * unsigned on some architectures! */ struct pcm_format_data { unsigned char width; /* bit width */ unsigned char phys; /* physical bit width */ signed char le; /* 0 = big-endian, 1 = little-endian, -1 = others */ signed char signd; /* 0 = unsigned, 1 = signed, -1 = others */ unsigned char silence[8]; /* silence data to fill */ }; /* we do lots of calculations on snd_pcm_format_t; shut up sparse */ #define INT __force int static bool valid_format(snd_pcm_format_t format) { return (INT)format >= 0 && (INT)format <= (INT)SNDRV_PCM_FORMAT_LAST; } static const struct pcm_format_data pcm_formats[(INT)SNDRV_PCM_FORMAT_LAST+1] = { [SNDRV_PCM_FORMAT_S8] = { .width = 8, .phys = 8, .le = -1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U8] = { .width = 8, .phys = 8, .le = -1, .signd = 0, .silence = { 0x80 }, }, [SNDRV_PCM_FORMAT_S16_LE] = { .width = 16, .phys = 16, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S16_BE] = { .width = 16, .phys = 16, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U16_LE] = { .width = 16, .phys = 16, .le = 1, .signd = 0, .silence = { 0x00, 0x80 }, }, [SNDRV_PCM_FORMAT_U16_BE] = { .width = 16, .phys = 16, .le = 0, .signd = 0, .silence = { 0x80, 0x00 }, }, [SNDRV_PCM_FORMAT_S24_LE] = { .width = 24, .phys = 32, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S24_BE] = { .width = 24, .phys = 32, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U24_LE] = { .width = 24, .phys = 32, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x80 }, }, [SNDRV_PCM_FORMAT_U24_BE] = { .width = 24, .phys = 32, .le = 0, .signd = 0, .silence = { 0x00, 0x80, 0x00, 0x00 }, }, [SNDRV_PCM_FORMAT_S32_LE] = { .width = 32, .phys = 32, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S32_BE] = { .width = 32, .phys = 32, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U32_LE] = { .width = 32, .phys = 32, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x00, 0x80 }, }, [SNDRV_PCM_FORMAT_U32_BE] = { .width = 32, .phys = 32, .le = 0, .signd = 0, .silence = { 0x80, 0x00, 0x00, 0x00 }, }, [SNDRV_PCM_FORMAT_FLOAT_LE] = { .width = 32, .phys = 32, .le = 1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_FLOAT_BE] = { .width = 32, .phys = 32, .le = 0, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_FLOAT64_LE] = { .width = 64, .phys = 64, .le = 1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_FLOAT64_BE] = { .width = 64, .phys = 64, .le = 0, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_IEC958_SUBFRAME_LE] = { .width = 32, .phys = 32, .le = 1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_IEC958_SUBFRAME_BE] = { .width = 32, .phys = 32, .le = 0, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_MU_LAW] = { .width = 8, .phys = 8, .le = -1, .signd = -1, .silence = { 0x7f }, }, [SNDRV_PCM_FORMAT_A_LAW] = { .width = 8, .phys = 8, .le = -1, .signd = -1, .silence = { 0x55 }, }, [SNDRV_PCM_FORMAT_IMA_ADPCM] = { .width = 4, .phys = 4, .le = -1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_G723_24] = { .width = 3, .phys = 3, .le = -1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_G723_40] = { .width = 5, .phys = 5, .le = -1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_DSD_U8] = { .width = 8, .phys = 8, .le = 1, .signd = 0, .silence = { 0x69 }, }, [SNDRV_PCM_FORMAT_DSD_U16_LE] = { .width = 16, .phys = 16, .le = 1, .signd = 0, .silence = { 0x69, 0x69 }, }, [SNDRV_PCM_FORMAT_DSD_U32_LE] = { .width = 32, .phys = 32, .le = 1, .signd = 0, .silence = { 0x69, 0x69, 0x69, 0x69 }, }, [SNDRV_PCM_FORMAT_DSD_U16_BE] = { .width = 16, .phys = 16, .le = 0, .signd = 0, .silence = { 0x69, 0x69 }, }, [SNDRV_PCM_FORMAT_DSD_U32_BE] = { .width = 32, .phys = 32, .le = 0, .signd = 0, .silence = { 0x69, 0x69, 0x69, 0x69 }, }, /* FIXME: the following two formats are not defined properly yet */ [SNDRV_PCM_FORMAT_MPEG] = { .le = -1, .signd = -1, }, [SNDRV_PCM_FORMAT_GSM] = { .le = -1, .signd = -1, }, [SNDRV_PCM_FORMAT_S20_LE] = { .width = 20, .phys = 32, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S20_BE] = { .width = 20, .phys = 32, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U20_LE] = { .width = 20, .phys = 32, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x08, 0x00 }, }, [SNDRV_PCM_FORMAT_U20_BE] = { .width = 20, .phys = 32, .le = 0, .signd = 0, .silence = { 0x00, 0x08, 0x00, 0x00 }, }, /* FIXME: the following format is not defined properly yet */ [SNDRV_PCM_FORMAT_SPECIAL] = { .le = -1, .signd = -1, }, [SNDRV_PCM_FORMAT_S24_3LE] = { .width = 24, .phys = 24, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S24_3BE] = { .width = 24, .phys = 24, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U24_3LE] = { .width = 24, .phys = 24, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x80 }, }, [SNDRV_PCM_FORMAT_U24_3BE] = { .width = 24, .phys = 24, .le = 0, .signd = 0, .silence = { 0x80, 0x00, 0x00 }, }, [SNDRV_PCM_FORMAT_S20_3LE] = { .width = 20, .phys = 24, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S20_3BE] = { .width = 20, .phys = 24, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U20_3LE] = { .width = 20, .phys = 24, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x08 }, }, [SNDRV_PCM_FORMAT_U20_3BE] = { .width = 20, .phys = 24, .le = 0, .signd = 0, .silence = { 0x08, 0x00, 0x00 }, }, [SNDRV_PCM_FORMAT_S18_3LE] = { .width = 18, .phys = 24, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S18_3BE] = { .width = 18, .phys = 24, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U18_3LE] = { .width = 18, .phys = 24, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x02 }, }, [SNDRV_PCM_FORMAT_U18_3BE] = { .width = 18, .phys = 24, .le = 0, .signd = 0, .silence = { 0x02, 0x00, 0x00 }, }, [SNDRV_PCM_FORMAT_G723_24_1B] = { .width = 3, .phys = 8, .le = -1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_G723_40_1B] = { .width = 5, .phys = 8, .le = -1, .signd = -1, .silence = {}, }, }; /** * snd_pcm_format_signed - Check the PCM format is signed linear * @format: the format to check * * Return: 1 if the given PCM format is signed linear, 0 if unsigned * linear, and a negative error code for non-linear formats. */ int snd_pcm_format_signed(snd_pcm_format_t format) { int val; if (!valid_format(format)) return -EINVAL; val = pcm_formats[(INT)format].signd; if (val < 0) return -EINVAL; return val; } EXPORT_SYMBOL(snd_pcm_format_signed); /** * snd_pcm_format_unsigned - Check the PCM format is unsigned linear * @format: the format to check * * Return: 1 if the given PCM format is unsigned linear, 0 if signed * linear, and a negative error code for non-linear formats. */ int snd_pcm_format_unsigned(snd_pcm_format_t format) { int val; val = snd_pcm_format_signed(format); if (val < 0) return val; return !val; } EXPORT_SYMBOL(snd_pcm_format_unsigned); /** * snd_pcm_format_linear - Check the PCM format is linear * @format: the format to check * * Return: 1 if the given PCM format is linear, 0 if not. */ int snd_pcm_format_linear(snd_pcm_format_t format) { return snd_pcm_format_signed(format) >= 0; } EXPORT_SYMBOL(snd_pcm_format_linear); /** * snd_pcm_format_little_endian - Check the PCM format is little-endian * @format: the format to check * * Return: 1 if the given PCM format is little-endian, 0 if * big-endian, or a negative error code if endian not specified. */ int snd_pcm_format_little_endian(snd_pcm_format_t format) { int val; if (!valid_format(format)) return -EINVAL; val = pcm_formats[(INT)format].le; if (val < 0) return -EINVAL; return val; } EXPORT_SYMBOL(snd_pcm_format_little_endian); /** * snd_pcm_format_big_endian - Check the PCM format is big-endian * @format: the format to check * * Return: 1 if the given PCM format is big-endian, 0 if * little-endian, or a negative error code if endian not specified. */ int snd_pcm_format_big_endian(snd_pcm_format_t format) { int val; val = snd_pcm_format_little_endian(format); if (val < 0) return val; return !val; } EXPORT_SYMBOL(snd_pcm_format_big_endian); /** * snd_pcm_format_width - return the bit-width of the format * @format: the format to check * * Return: The bit-width of the format, or a negative error code * if unknown format. */ int snd_pcm_format_width(snd_pcm_format_t format) { int val; if (!valid_format(format)) return -EINVAL; val = pcm_formats[(INT)format].width; if (!val) return -EINVAL; return val; } EXPORT_SYMBOL(snd_pcm_format_width); /** * snd_pcm_format_physical_width - return the physical bit-width of the format * @format: the format to check * * Return: The physical bit-width of the format, or a negative error code * if unknown format. */ int snd_pcm_format_physical_width(snd_pcm_format_t format) { int val; if (!valid_format(format)) return -EINVAL; val = pcm_formats[(INT)format].phys; if (!val) return -EINVAL; return val; } EXPORT_SYMBOL(snd_pcm_format_physical_width); /** * snd_pcm_format_size - return the byte size of samples on the given format * @format: the format to check * @samples: sampling rate * * Return: The byte size of the given samples for the format, or a * negative error code if unknown format. */ ssize_t snd_pcm_format_size(snd_pcm_format_t format, size_t samples) { int phys_width = snd_pcm_format_physical_width(format); if (phys_width < 0) return -EINVAL; return samples * phys_width / 8; } EXPORT_SYMBOL(snd_pcm_format_size); /** * snd_pcm_format_silence_64 - return the silent data in 8 bytes array * @format: the format to check * * Return: The format pattern to fill or %NULL if error. */ const unsigned char *snd_pcm_format_silence_64(snd_pcm_format_t format) { if (!valid_format(format)) return NULL; if (! pcm_formats[(INT)format].phys) return NULL; return pcm_formats[(INT)format].silence; } EXPORT_SYMBOL(snd_pcm_format_silence_64); /** * snd_pcm_format_set_silence - set the silence data on the buffer * @format: the PCM format * @data: the buffer pointer * @samples: the number of samples to set silence * * Sets the silence data on the buffer for the given samples. * * Return: Zero if successful, or a negative error code on failure. */ int snd_pcm_format_set_silence(snd_pcm_format_t format, void *data, unsigned int samples) { int width; unsigned char *dst; const unsigned char *pat; if (!valid_format(format)) return -EINVAL; if (samples == 0) return 0; width = pcm_formats[(INT)format].phys; /* physical width */ pat = pcm_formats[(INT)format].silence; if (!width || !pat) return -EINVAL; /* signed or 1 byte data */ if (pcm_formats[(INT)format].signd == 1 || width <= 8) { unsigned int bytes = samples * width / 8; memset(data, *pat, bytes); return 0; } /* non-zero samples, fill using a loop */ width /= 8; dst = data; #if 0 while (samples--) { memcpy(dst, pat, width); dst += width; } #else /* a bit optimization for constant width */ switch (width) { case 2: while (samples--) { memcpy(dst, pat, 2); dst += 2; } break; case 3: while (samples--) { memcpy(dst, pat, 3); dst += 3; } break; case 4: while (samples--) { memcpy(dst, pat, 4); dst += 4; } break; case 8: while (samples--) { memcpy(dst, pat, 8); dst += 8; } break; } #endif return 0; } EXPORT_SYMBOL(snd_pcm_format_set_silence); /** * snd_pcm_hw_limit_rates - determine rate_min/rate_max fields * @hw: the pcm hw instance * * Determines the rate_min and rate_max fields from the rates bits of * the given hw. * * Return: Zero if successful. */ int snd_pcm_hw_limit_rates(struct snd_pcm_hardware *hw) { int i; for (i = 0; i < (int)snd_pcm_known_rates.count; i++) { if (hw->rates & (1 << i)) { hw->rate_min = snd_pcm_known_rates.list[i]; break; } } for (i = (int)snd_pcm_known_rates.count - 1; i >= 0; i--) { if (hw->rates & (1 << i)) { hw->rate_max = snd_pcm_known_rates.list[i]; break; } } return 0; } EXPORT_SYMBOL(snd_pcm_hw_limit_rates); /** * snd_pcm_rate_to_rate_bit - converts sample rate to SNDRV_PCM_RATE_xxx bit * @rate: the sample rate to convert * * Return: The SNDRV_PCM_RATE_xxx flag that corresponds to the given rate, or * SNDRV_PCM_RATE_KNOT for an unknown rate. */ unsigned int snd_pcm_rate_to_rate_bit(unsigned int rate) { unsigned int i; for (i = 0; i < snd_pcm_known_rates.count; i++) if (snd_pcm_known_rates.list[i] == rate) return 1u << i; return SNDRV_PCM_RATE_KNOT; } EXPORT_SYMBOL(snd_pcm_rate_to_rate_bit); /** * snd_pcm_rate_bit_to_rate - converts SNDRV_PCM_RATE_xxx bit to sample rate * @rate_bit: the rate bit to convert * * Return: The sample rate that corresponds to the given SNDRV_PCM_RATE_xxx flag * or 0 for an unknown rate bit. */ unsigned int snd_pcm_rate_bit_to_rate(unsigned int rate_bit) { unsigned int i; for (i = 0; i < snd_pcm_known_rates.count; i++) if ((1u << i) == rate_bit) return snd_pcm_known_rates.list[i]; return 0; } EXPORT_SYMBOL(snd_pcm_rate_bit_to_rate); static unsigned int snd_pcm_rate_mask_sanitize(unsigned int rates) { if (rates & SNDRV_PCM_RATE_CONTINUOUS) return SNDRV_PCM_RATE_CONTINUOUS; else if (rates & SNDRV_PCM_RATE_KNOT) return SNDRV_PCM_RATE_KNOT; return rates; } /** * snd_pcm_rate_mask_intersect - computes the intersection between two rate masks * @rates_a: The first rate mask * @rates_b: The second rate mask * * This function computes the rates that are supported by both rate masks passed * to the function. It will take care of the special handling of * SNDRV_PCM_RATE_CONTINUOUS and SNDRV_PCM_RATE_KNOT. * * Return: A rate mask containing the rates that are supported by both rates_a * and rates_b. */ unsigned int snd_pcm_rate_mask_intersect(unsigned int rates_a, unsigned int rates_b) { rates_a = snd_pcm_rate_mask_sanitize(rates_a); rates_b = snd_pcm_rate_mask_sanitize(rates_b); if (rates_a & SNDRV_PCM_RATE_CONTINUOUS) return rates_b; else if (rates_b & SNDRV_PCM_RATE_CONTINUOUS) return rates_a; else if (rates_a & SNDRV_PCM_RATE_KNOT) return rates_b; else if (rates_b & SNDRV_PCM_RATE_KNOT) return rates_a; return rates_a & rates_b; } EXPORT_SYMBOL_GPL(snd_pcm_rate_mask_intersect); /** * snd_pcm_rate_range_to_bits - converts rate range to SNDRV_PCM_RATE_xxx bit * @rate_min: the minimum sample rate * @rate_max: the maximum sample rate * * This function has an implicit assumption: the rates in the given range have * only the pre-defined rates like 44100 or 16000. * * Return: The SNDRV_PCM_RATE_xxx flag that corresponds to the given rate range, * or SNDRV_PCM_RATE_KNOT for an unknown range. */ unsigned int snd_pcm_rate_range_to_bits(unsigned int rate_min, unsigned int rate_max) { unsigned int rates = 0; int i; for (i = 0; i < snd_pcm_known_rates.count; i++) { if (snd_pcm_known_rates.list[i] >= rate_min && snd_pcm_known_rates.list[i] <= rate_max) rates |= 1 << i; } if (!rates) rates = SNDRV_PCM_RATE_KNOT; return rates; } EXPORT_SYMBOL_GPL(snd_pcm_rate_range_to_bits); |
81 82 11 78 11 77 54 54 41 13 44 10 54 11 44 73 2 1 84 5 82 5 77 82 6 76 76 6 6 14 14 2 2 5 4 10 2 2 1 1 5 1 107 1 54 1543 1531 108 77 1534 33 260 35 1462 221 79 12 14 1 2 10 3 1 53 5 19 2 1 1 19 1 19 40 1 6 7 1 2 2 1 2 1 44 1 43 3 39 1 1 2 1 2 1 3 1 1 1 1 14 10 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET 802.1Q VLAN * Ethernet-type device handling. * * Authors: Ben Greear <greearb@candelatech.com> * Please send support related email to: netdev@vger.kernel.org * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: * Fix for packet capture - Nick Eggleston <nick@dccinc.com>; * Add HW acceleration hooks - David S. Miller <davem@redhat.com>; * Correct all the locking - David S. Miller <davem@redhat.com>; * Use hash table for VLAN groups - David S. Miller <davem@redhat.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> #include <net/p8022.h> #include <net/arp.h> #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/uaccess.h> #include <linux/if_vlan.h> #include "vlan.h" #include "vlanproc.h" #define DRV_VERSION "1.8" /* Global VLAN variables */ unsigned int vlan_net_id __read_mostly; const char vlan_fullname[] = "802.1Q VLAN Support"; const char vlan_version[] = DRV_VERSION; /* End of global variables definitions. */ static int vlan_group_prealloc_vid(struct vlan_group *vg, __be16 vlan_proto, u16 vlan_id) { struct net_device **array; unsigned int vidx; unsigned int size; int pidx; ASSERT_RTNL(); pidx = vlan_proto_idx(vlan_proto); if (pidx < 0) return -EINVAL; vidx = vlan_id / VLAN_GROUP_ARRAY_PART_LEN; array = vg->vlan_devices_arrays[pidx][vidx]; if (array != NULL) return 0; size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; array = kzalloc(size, GFP_KERNEL_ACCOUNT); if (array == NULL) return -ENOBUFS; /* paired with smp_rmb() in __vlan_group_get_device() */ smp_wmb(); vg->vlan_devices_arrays[pidx][vidx] = array; return 0; } static void vlan_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev, struct vlan_dev_priv *vlan) { if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_stacked_transfer_operstate(rootdev, dev); } void unregister_vlan_dev(struct net_device *dev, struct list_head *head) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; struct vlan_info *vlan_info; struct vlan_group *grp; u16 vlan_id = vlan->vlan_id; ASSERT_RTNL(); vlan_info = rtnl_dereference(real_dev->vlan_info); BUG_ON(!vlan_info); grp = &vlan_info->grp; grp->nr_vlan_devs--; if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_leave(dev); if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_leave(dev); vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, NULL); netdev_upper_dev_unlink(real_dev, dev); /* Because unregister_netdevice_queue() makes sure at least one rcu * grace period is respected before device freeing, * we dont need to call synchronize_net() here. */ unregister_netdevice_queue(dev, head); if (grp->nr_vlan_devs == 0) { vlan_mvrp_uninit_applicant(real_dev); vlan_gvrp_uninit_applicant(real_dev); } vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); } int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id, struct netlink_ext_ack *extack) { const char *name = real_dev->name; if (real_dev->features & NETIF_F_VLAN_CHALLENGED) { pr_info("VLANs not supported on %s\n", name); NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device"); return -EOPNOTSUPP; } if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) { NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists"); return -EEXIST; } return 0; } int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; u16 vlan_id = vlan->vlan_id; struct vlan_info *vlan_info; struct vlan_group *grp; int err; err = vlan_vid_add(real_dev, vlan->vlan_proto, vlan_id); if (err) return err; vlan_info = rtnl_dereference(real_dev->vlan_info); /* vlan_info should be there now. vlan_vid_add took care of it */ BUG_ON(!vlan_info); grp = &vlan_info->grp; if (grp->nr_vlan_devs == 0) { err = vlan_gvrp_init_applicant(real_dev); if (err < 0) goto out_vid_del; err = vlan_mvrp_init_applicant(real_dev); if (err < 0) goto out_uninit_gvrp; } err = vlan_group_prealloc_vid(grp, vlan->vlan_proto, vlan_id); if (err < 0) goto out_uninit_mvrp; err = register_netdevice(dev); if (err < 0) goto out_uninit_mvrp; err = netdev_upper_dev_link(real_dev, dev, extack); if (err) goto out_unregister_netdev; vlan_stacked_transfer_operstate(real_dev, dev, vlan); linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */ /* So, got the sucker initialized, now lets place * it into our local structure. */ vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev); grp->nr_vlan_devs++; return 0; out_unregister_netdev: unregister_netdevice(dev); out_uninit_mvrp: if (grp->nr_vlan_devs == 0) vlan_mvrp_uninit_applicant(real_dev); out_uninit_gvrp: if (grp->nr_vlan_devs == 0) vlan_gvrp_uninit_applicant(real_dev); out_vid_del: vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); return err; } /* Attach a VLAN device to a mac address (ie Ethernet Card). * Returns 0 if the device was created or a negative error code otherwise. */ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) { struct net_device *new_dev; struct vlan_dev_priv *vlan; struct net *net = dev_net(real_dev); struct vlan_net *vn = net_generic(net, vlan_net_id); char name[IFNAMSIZ]; int err; if (vlan_id >= VLAN_VID_MASK) return -ERANGE; err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id, NULL); if (err < 0) return err; /* Gotta set up the fields for the device. */ switch (vn->name_type) { case VLAN_NAME_TYPE_RAW_PLUS_VID: /* name will look like: eth1.0005 */ snprintf(name, IFNAMSIZ, "%s.%.4i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: vlan5 */ snprintf(name, IFNAMSIZ, "vlan%i", vlan_id); break; case VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: eth0.5 */ snprintf(name, IFNAMSIZ, "%s.%i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID: /* Put our vlan.VID in the name. * Name will look like: vlan0005 */ default: snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id); } new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name, NET_NAME_UNKNOWN, vlan_setup); if (new_dev == NULL) return -ENOBUFS; dev_net_set(new_dev, net); /* need 4 bytes for extra VLAN header info, * hope the underlying device can handle it. */ new_dev->mtu = real_dev->mtu; vlan = vlan_dev_priv(new_dev); vlan->vlan_proto = htons(ETH_P_8021Q); vlan->vlan_id = vlan_id; vlan->real_dev = real_dev; vlan->dent = NULL; vlan->flags = VLAN_FLAG_REORDER_HDR; new_dev->rtnl_link_ops = &vlan_link_ops; err = register_vlan_dev(new_dev, NULL); if (err < 0) goto out_free_newdev; return 0; out_free_newdev: free_netdev(new_dev); return err; } static void vlan_sync_address(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); /* May be called without an actual change */ if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) return; /* vlan continues to inherit address of lower device */ if (vlan_dev_inherit_address(vlandev, dev)) goto out; /* vlan address was different from the old address and is equal to * the new address */ if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_del(dev, vlandev->dev_addr); /* vlan address was equal to the old address and is different from * the new address */ if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_add(dev, vlandev->dev_addr); out: ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } static void vlan_transfer_features(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); netif_inherit_tso_max(vlandev, dev); if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto)) vlandev->hard_header_len = dev->hard_header_len; else vlandev->hard_header_len = dev->hard_header_len + VLAN_HLEN; #if IS_ENABLED(CONFIG_FCOE) vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid; #endif vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE; vlandev->priv_flags |= (vlan->real_dev->priv_flags & IFF_XMIT_DST_RELEASE); vlandev->hw_enc_features = vlan_tnl_features(vlan->real_dev); netdev_update_features(vlandev); } static int __vlan_device_event(struct net_device *dev, unsigned long event) { int err = 0; switch (event) { case NETDEV_CHANGENAME: vlan_proc_rem_dev(dev); err = vlan_proc_add_dev(dev); break; case NETDEV_REGISTER: err = vlan_proc_add_dev(dev); break; case NETDEV_UNREGISTER: vlan_proc_rem_dev(dev); break; } return err; } static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vlan_group *grp; struct vlan_info *vlan_info; int i, flgs; struct net_device *vlandev; struct vlan_dev_priv *vlan; bool last = false; LIST_HEAD(list); int err; if (is_vlan_dev(dev)) { int err = __vlan_device_event(dev, event); if (err) return notifier_from_errno(err); } if ((event == NETDEV_UP) && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) { pr_info("adding VLAN 0 to HW filter on device %s\n", dev->name); vlan_vid_add(dev, htons(ETH_P_8021Q), 0); } if (event == NETDEV_DOWN && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) vlan_vid_del(dev, htons(ETH_P_8021Q), 0); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) goto out; grp = &vlan_info->grp; /* It is OK that we do not hold the group lock right now, * as we run under the RTNL lock. */ switch (event) { case NETDEV_CHANGE: /* Propagate real device state to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); break; case NETDEV_CHANGEADDR: /* Adjust unicast filters on underlying device */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan_sync_address(dev, vlandev); } break; case NETDEV_CHANGEMTU: vlan_group_for_each_dev(grp, i, vlandev) { if (vlandev->mtu <= dev->mtu) continue; dev_set_mtu(vlandev, dev->mtu); } break; case NETDEV_FEAT_CHANGE: /* Propagate device features to underlying device */ vlan_group_for_each_dev(grp, i, vlandev) vlan_transfer_features(dev, vlandev); break; case NETDEV_DOWN: { struct net_device *tmp; LIST_HEAD(close_list); /* Put all VLANs for this dev in the down state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) list_add(&vlandev->close_list, &close_list); } dev_close_many(&close_list, false); list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) { vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); list_del_init(&vlandev->close_list); } list_del(&close_list); break; } case NETDEV_UP: /* Put all VLANs for this dev in the up state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = dev_get_flags(vlandev); if (flgs & IFF_UP) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) dev_change_flags(vlandev, flgs | IFF_UP, extack); vlan_stacked_transfer_operstate(dev, vlandev, vlan); } break; case NETDEV_UNREGISTER: /* twiddle thumbs on netns device moves */ if (dev->reg_state != NETREG_UNREGISTERING) break; vlan_group_for_each_dev(grp, i, vlandev) { /* removal of last vid destroys vlan_info, abort * afterwards */ if (vlan_info->nr_vids == 1) last = true; unregister_vlan_dev(vlandev, &list); if (last) break; } unregister_netdevice_many(&list); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlaying device to change its type. */ if (vlan_uses_dev(dev)) return NOTIFY_BAD; break; case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: case NETDEV_RESEND_IGMP: /* Propagate to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) call_netdevice_notifiers(event, vlandev); break; case NETDEV_CVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021Q)); if (err) return notifier_from_errno(err); break; case NETDEV_CVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021Q)); break; case NETDEV_SVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021AD)); if (err) return notifier_from_errno(err); break; case NETDEV_SVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021AD)); break; } out: return NOTIFY_DONE; } static struct notifier_block vlan_notifier_block __read_mostly = { .notifier_call = vlan_device_event, }; /* * VLAN IOCTL handler. * o execute requested action or pass command to the device driver * arg is really a struct vlan_ioctl_args __user *. */ static int vlan_ioctl_handler(struct net *net, void __user *arg) { int err; struct vlan_ioctl_args args; struct net_device *dev = NULL; if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args))) return -EFAULT; /* Null terminate this sucker, just in case. */ args.device1[sizeof(args.device1) - 1] = 0; args.u.device2[sizeof(args.u.device2) - 1] = 0; rtnl_lock(); switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: case SET_VLAN_EGRESS_PRIORITY_CMD: case SET_VLAN_FLAG_CMD: case ADD_VLAN_CMD: case DEL_VLAN_CMD: case GET_VLAN_REALDEV_NAME_CMD: case GET_VLAN_VID_CMD: err = -ENODEV; dev = __dev_get_by_name(net, args.device1); if (!dev) goto out; err = -EINVAL; if (args.cmd != ADD_VLAN_CMD && !is_vlan_dev(dev)) goto out; } switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; vlan_dev_set_ingress_priority(dev, args.u.skb_priority, args.vlan_qos); err = 0; break; case SET_VLAN_EGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_set_egress_priority(dev, args.u.skb_priority, args.vlan_qos); break; case SET_VLAN_FLAG_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_change_flags(dev, args.vlan_qos ? args.u.flag : 0, args.u.flag); break; case SET_VLAN_NAME_TYPE_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (args.u.name_type < VLAN_NAME_TYPE_HIGHEST) { struct vlan_net *vn; vn = net_generic(net, vlan_net_id); vn->name_type = args.u.name_type; err = 0; } else { err = -EINVAL; } break; case ADD_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = register_vlan_device(dev, args.u.VID); break; case DEL_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; unregister_vlan_dev(dev, NULL); err = 0; break; case GET_VLAN_REALDEV_NAME_CMD: err = 0; vlan_dev_get_realdev_name(dev, args.u.device2, sizeof(args.u.device2)); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; case GET_VLAN_VID_CMD: err = 0; args.u.VID = vlan_dev_vlan_id(dev); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; default: err = -EOPNOTSUPP; break; } out: rtnl_unlock(); return err; } static int __net_init vlan_init_net(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); int err; vn->name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD; err = vlan_proc_init(net); return err; } static void __net_exit vlan_exit_net(struct net *net) { vlan_proc_cleanup(net); } static struct pernet_operations vlan_net_ops = { .init = vlan_init_net, .exit = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct vlan_net), }; static int __init vlan_proto_init(void) { int err; pr_info("%s v%s\n", vlan_fullname, vlan_version); err = register_pernet_subsys(&vlan_net_ops); if (err < 0) goto err0; err = register_netdevice_notifier(&vlan_notifier_block); if (err < 0) goto err2; err = vlan_gvrp_init(); if (err < 0) goto err3; err = vlan_mvrp_init(); if (err < 0) goto err4; err = vlan_netlink_init(); if (err < 0) goto err5; vlan_ioctl_set(vlan_ioctl_handler); return 0; err5: vlan_mvrp_uninit(); err4: vlan_gvrp_uninit(); err3: unregister_netdevice_notifier(&vlan_notifier_block); err2: unregister_pernet_subsys(&vlan_net_ops); err0: return err; } static void __exit vlan_cleanup_module(void) { vlan_ioctl_set(NULL); vlan_netlink_fini(); unregister_netdevice_notifier(&vlan_notifier_block); unregister_pernet_subsys(&vlan_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ vlan_mvrp_uninit(); vlan_gvrp_uninit(); } module_init(vlan_proto_init); module_exit(vlan_cleanup_module); MODULE_DESCRIPTION("802.1Q/802.1ad VLAN Protocol"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); |
1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_MLD_H #define LINUX_MLD_H #include <linux/in6.h> #include <linux/icmpv6.h> /* MLDv1 Query/Report/Done */ struct mld_msg { struct icmp6hdr mld_hdr; struct in6_addr mld_mca; }; #define mld_type mld_hdr.icmp6_type #define mld_code mld_hdr.icmp6_code #define mld_cksum mld_hdr.icmp6_cksum #define mld_maxdelay mld_hdr.icmp6_maxdelay #define mld_reserved mld_hdr.icmp6_dataun.un_data16[1] /* Multicast Listener Discovery version 2 headers */ /* MLDv2 Report */ struct mld2_grec { __u8 grec_type; __u8 grec_auxwords; __be16 grec_nsrcs; struct in6_addr grec_mca; struct in6_addr grec_src[]; }; struct mld2_report { struct icmp6hdr mld2r_hdr; struct mld2_grec mld2r_grec[]; }; #define mld2r_type mld2r_hdr.icmp6_type #define mld2r_resv1 mld2r_hdr.icmp6_code #define mld2r_cksum mld2r_hdr.icmp6_cksum #define mld2r_resv2 mld2r_hdr.icmp6_dataun.un_data16[0] #define mld2r_ngrec mld2r_hdr.icmp6_dataun.un_data16[1] /* MLDv2 Query */ struct mld2_query { struct icmp6hdr mld2q_hdr; struct in6_addr mld2q_mca; #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 mld2q_qrv:3, mld2q_suppress:1, mld2q_resv2:4; #elif defined(__BIG_ENDIAN_BITFIELD) __u8 mld2q_resv2:4, mld2q_suppress:1, mld2q_qrv:3; #else #error "Please fix <asm/byteorder.h>" #endif __u8 mld2q_qqic; __be16 mld2q_nsrcs; struct in6_addr mld2q_srcs[]; }; #define mld2q_type mld2q_hdr.icmp6_type #define mld2q_code mld2q_hdr.icmp6_code #define mld2q_cksum mld2q_hdr.icmp6_cksum #define mld2q_mrc mld2q_hdr.icmp6_maxdelay #define mld2q_resv1 mld2q_hdr.icmp6_dataun.un_data16[1] /* RFC3810, 5.1.3. Maximum Response Code: * * If Maximum Response Code >= 32768, Maximum Response Code represents a * floating-point value as follows: * * 0 1 2 3 4 5 6 7 8 9 A B C D E F * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |1| exp | mant | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ #define MLDV2_MRC_EXP(value) (((value) >> 12) & 0x0007) #define MLDV2_MRC_MAN(value) ((value) & 0x0fff) /* RFC3810, 5.1.9. QQIC (Querier's Query Interval Code): * * If QQIC >= 128, QQIC represents a floating-point value as follows: * * 0 1 2 3 4 5 6 7 * +-+-+-+-+-+-+-+-+ * |1| exp | mant | * +-+-+-+-+-+-+-+-+ */ #define MLDV2_QQIC_EXP(value) (((value) >> 4) & 0x07) #define MLDV2_QQIC_MAN(value) ((value) & 0x0f) #define MLD_EXP_MIN_LIMIT 32768UL #define MLDV1_MRD_MAX_COMPAT (MLD_EXP_MIN_LIMIT - 1) #define MLD_MAX_QUEUE 8 #define MLD_MAX_SKBS 32 static inline unsigned long mldv2_mrc(const struct mld2_query *mlh2) { /* RFC3810, 5.1.3. Maximum Response Code */ unsigned long ret, mc_mrc = ntohs(mlh2->mld2q_mrc); if (mc_mrc < MLD_EXP_MIN_LIMIT) { ret = mc_mrc; } else { unsigned long mc_man, mc_exp; mc_exp = MLDV2_MRC_EXP(mc_mrc); mc_man = MLDV2_MRC_MAN(mc_mrc); ret = (mc_man | 0x1000) << (mc_exp + 3); } return ret; } #endif |
133 11 129 124 118 1 7 17 17 14 17 11 17 6 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/termios.h> #include <linux/tty.h> #include <linux/export.h> #include "tty.h" /* * Routine which returns the baud rate of the tty * * Note that the baud_table needs to be kept in sync with the * include/asm/termbits.h file. */ static const speed_t baud_table[] = { 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, 1800, 2400, 4800, 9600, 19200, 38400, 57600, 115200, 230400, 460800, #ifdef __sparc__ 76800, 153600, 307200, 614400, 921600, 500000, 576000, 1000000, 1152000, 1500000, 2000000 #else 500000, 576000, 921600, 1000000, 1152000, 1500000, 2000000, 2500000, 3000000, 3500000, 4000000 #endif }; static const tcflag_t baud_bits[] = { B0, B50, B75, B110, B134, B150, B200, B300, B600, B1200, B1800, B2400, B4800, B9600, B19200, B38400, B57600, B115200, B230400, B460800, #ifdef __sparc__ B76800, B153600, B307200, B614400, B921600, B500000, B576000, B1000000, B1152000, B1500000, B2000000 #else B500000, B576000, B921600, B1000000, B1152000, B1500000, B2000000, B2500000, B3000000, B3500000, B4000000 #endif }; static int n_baud_table = ARRAY_SIZE(baud_table); /** * tty_termios_baud_rate * @termios: termios structure * * Convert termios baud rate data into a speed. This should be called * with the termios lock held if this termios is a terminal termios * structure. Device drivers can call this function but should use * ->c_[io]speed directly as they are updated. * * Locking: none */ speed_t tty_termios_baud_rate(const struct ktermios *termios) { unsigned int cbaud; cbaud = termios->c_cflag & CBAUD; /* Magic token for arbitrary speed via c_ispeed/c_ospeed */ if (cbaud == BOTHER) return termios->c_ospeed; if (cbaud & CBAUDEX) { cbaud &= ~CBAUDEX; cbaud += 15; } return cbaud >= n_baud_table ? 0 : baud_table[cbaud]; } EXPORT_SYMBOL(tty_termios_baud_rate); /** * tty_termios_input_baud_rate * @termios: termios structure * * Convert termios baud rate data into a speed. This should be called * with the termios lock held if this termios is a terminal termios * structure. Device drivers can call this function but should use * ->c_[io]speed directly as they are updated. * * Locking: none */ speed_t tty_termios_input_baud_rate(const struct ktermios *termios) { unsigned int cbaud = (termios->c_cflag >> IBSHIFT) & CBAUD; if (cbaud == B0) return tty_termios_baud_rate(termios); /* Magic token for arbitrary speed via c_ispeed */ if (cbaud == BOTHER) return termios->c_ispeed; if (cbaud & CBAUDEX) { cbaud &= ~CBAUDEX; cbaud += 15; } return cbaud >= n_baud_table ? 0 : baud_table[cbaud]; } EXPORT_SYMBOL(tty_termios_input_baud_rate); /** * tty_termios_encode_baud_rate * @termios: ktermios structure holding user requested state * @ibaud: input speed * @obaud: output speed * * Encode the speeds set into the passed termios structure. This is * used as a library helper for drivers so that they can report back * the actual speed selected when it differs from the speed requested * * For maximal back compatibility with legacy SYS5/POSIX *nix behaviour * we need to carefully set the bits when the user does not get the * desired speed. We allow small margins and preserve as much of possible * of the input intent to keep compatibility. * * Locking: Caller should hold termios lock. This is already held * when calling this function from the driver termios handler. * * The ifdefs deal with platforms whose owners have yet to update them * and will all go away once this is done. */ void tty_termios_encode_baud_rate(struct ktermios *termios, speed_t ibaud, speed_t obaud) { int i = 0; int ifound = -1, ofound = -1; int iclose = ibaud/50, oclose = obaud/50; int ibinput = 0; if (obaud == 0) /* CD dropped */ ibaud = 0; /* Clear ibaud to be sure */ termios->c_ispeed = ibaud; termios->c_ospeed = obaud; if (((termios->c_cflag >> IBSHIFT) & CBAUD) != B0) ibinput = 1; /* An input speed was specified */ /* If the user asked for a precise weird speed give a precise weird * answer. If they asked for a Bfoo speed they may have problems * digesting non-exact replies so fuzz a bit. */ if ((termios->c_cflag & CBAUD) == BOTHER) { oclose = 0; if (!ibinput) iclose = 0; } if (((termios->c_cflag >> IBSHIFT) & CBAUD) == BOTHER) iclose = 0; termios->c_cflag &= ~CBAUD; termios->c_cflag &= ~(CBAUD << IBSHIFT); /* * Our goal is to find a close match to the standard baud rate * returned. Walk the baud rate table and if we get a very close * match then report back the speed as a POSIX Bxxxx value by * preference */ do { if (obaud - oclose <= baud_table[i] && obaud + oclose >= baud_table[i]) { termios->c_cflag |= baud_bits[i]; ofound = i; } if (ibaud - iclose <= baud_table[i] && ibaud + iclose >= baud_table[i]) { /* For the case input == output don't set IBAUD bits * if the user didn't do so. */ if (ofound == i && !ibinput) { ifound = i; } else { ifound = i; termios->c_cflag |= (baud_bits[i] << IBSHIFT); } } } while (++i < n_baud_table); /* If we found no match then use BOTHER. */ if (ofound == -1) termios->c_cflag |= BOTHER; /* Set exact input bits only if the input and output differ or the * user already did. */ if (ifound == -1 && (ibaud != obaud || ibinput)) termios->c_cflag |= (BOTHER << IBSHIFT); } EXPORT_SYMBOL_GPL(tty_termios_encode_baud_rate); /** * tty_encode_baud_rate - set baud rate of the tty * @tty: terminal device * @ibaud: input baud rate * @obaud: output baud rate * * Update the current termios data for the tty with the new speed * settings. The caller must hold the termios_rwsem for the tty in * question. */ void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, speed_t obaud) { tty_termios_encode_baud_rate(&tty->termios, ibaud, obaud); } EXPORT_SYMBOL_GPL(tty_encode_baud_rate); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,ip type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ /* 3 Comments support added */ /* 4 Forceadd support added */ /* 5 skbinfo support added */ #define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,ip"); /* Type specific function prefix */ #define HTYPE hash_ipportip /* IPv4 variant */ /* Member elements */ struct hash_ipportip4_elem { __be32 ip; __be32 ip2; __be16 port; u8 proto; u8 padding; }; static bool hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, const struct hash_ipportip4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->ip2 == ip2->ip2 && ip1->port == ip2->port && ip1->proto == ip2->proto; } static bool hash_ipportip4_data_list(struct sk_buff *skb, const struct hash_ipportip4_elem *data) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportip4_data_next(struct hash_ipportip4_elem *next, const struct hash_ipportip4_elem *d) { next->ip = d->ip; next->port = d->port; } /* Common functions */ #define MTYPE hash_ipportip4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_ipportip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip, ip_to = 0, p = 0, port, port_to, i = 0; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip = ntohl(e.ip); if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) swap(ip, ip_to); } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } port_to = port = ntohs(e.port); if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); } if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++, i++) { e.ip = htonl(ip); e.port = htons(p); if (i > IPSET_MAX_RANGE) { hash_ipportip4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } } return ret; } /* IPv6 variant */ struct hash_ipportip6_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 proto; u8 padding; }; /* Common functions */ static bool hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, const struct hash_ipportip6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && ip1->port == ip2->port && ip1->proto == ip2->proto; } static bool hash_ipportip6_data_list(struct sk_buff *skb, const struct hash_ipportip6_elem *data) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportip6_data_next(struct hash_ipportip6_elem *next, const struct hash_ipportip6_elem *d) { next->port = d->port; } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ipportip6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ipportip6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { e.port = htons(port); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } static struct ip_set_type hash_ipportip_type __read_mostly = { .name = "hash:ip,port,ip", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2, .dimension = IPSET_DIM_THREE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipportip_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipportip_init(void) { return ip_set_type_register(&hash_ipportip_type); } static void __exit hash_ipportip_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ipportip_type); } module_init(hash_ipportip_init); module_exit(hash_ipportip_fini); |
6 89 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MBCACHE_H #define _LINUX_MBCACHE_H #include <linux/hash.h> #include <linux/list_bl.h> #include <linux/list.h> #include <linux/atomic.h> #include <linux/fs.h> struct mb_cache; /* Cache entry flags */ enum { MBE_REFERENCED_B = 0, MBE_REUSABLE_B }; struct mb_cache_entry { /* List of entries in cache - protected by cache->c_list_lock */ struct list_head e_list; /* * Hash table list - protected by hash chain bitlock. The entry is * guaranteed to be hashed while e_refcnt > 0. */ struct hlist_bl_node e_hash_list; /* * Entry refcount. Once it reaches zero, entry is unhashed and freed. * While refcount > 0, the entry is guaranteed to stay in the hash and * e.g. mb_cache_entry_try_delete() will fail. */ atomic_t e_refcnt; /* Key in hash - stable during lifetime of the entry */ u32 e_key; unsigned long e_flags; /* User provided value - stable during lifetime of the entry */ u64 e_value; }; struct mb_cache *mb_cache_create(int bucket_bits); void mb_cache_destroy(struct mb_cache *cache); int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, u64 value, bool reusable); void __mb_cache_entry_free(struct mb_cache *cache, struct mb_cache_entry *entry); void mb_cache_entry_wait_unused(struct mb_cache_entry *entry); static inline void mb_cache_entry_put(struct mb_cache *cache, struct mb_cache_entry *entry) { unsigned int cnt = atomic_dec_return(&entry->e_refcnt); if (cnt > 0) { if (cnt <= 2) wake_up_var(&entry->e_refcnt); return; } __mb_cache_entry_free(cache, entry); } struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache, u32 key, u64 value); struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, u64 value); struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, u32 key); struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, struct mb_cache_entry *entry); void mb_cache_entry_touch(struct mb_cache *cache, struct mb_cache_entry *entry); #endif /* _LINUX_MBCACHE_H */ |
6 1 2 6 2 5 5 3 3 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 | // SPDX-License-Identifier: GPL-2.0 /* * Opening fs-verity files * * Copyright 2019 Google LLC */ #include "fsverity_private.h" #include <linux/mm.h> #include <linux/slab.h> static struct kmem_cache *fsverity_info_cachep; /** * fsverity_init_merkle_tree_params() - initialize Merkle tree parameters * @params: the parameters struct to initialize * @inode: the inode for which the Merkle tree is being built * @hash_algorithm: number of hash algorithm to use * @log_blocksize: log base 2 of block size to use * @salt: pointer to salt (optional) * @salt_size: size of salt, possibly 0 * * Validate the hash algorithm and block size, then compute the tree topology * (num levels, num blocks in each level, etc.) and initialize @params. * * Return: 0 on success, -errno on failure */ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, const struct inode *inode, unsigned int hash_algorithm, unsigned int log_blocksize, const u8 *salt, size_t salt_size) { const struct fsverity_hash_alg *hash_alg; int err; u64 blocks; u64 blocks_in_level[FS_VERITY_MAX_LEVELS]; u64 offset; int level; memset(params, 0, sizeof(*params)); hash_alg = fsverity_get_hash_alg(inode, hash_algorithm); if (IS_ERR(hash_alg)) return PTR_ERR(hash_alg); params->hash_alg = hash_alg; params->digest_size = hash_alg->digest_size; params->hashstate = fsverity_prepare_hash_state(hash_alg, salt, salt_size); if (IS_ERR(params->hashstate)) { err = PTR_ERR(params->hashstate); params->hashstate = NULL; fsverity_err(inode, "Error %d preparing hash state", err); goto out_err; } /* * fs/verity/ directly assumes that the Merkle tree block size is a * power of 2 less than or equal to PAGE_SIZE. Another restriction * arises from the interaction between fs/verity/ and the filesystems * themselves: filesystems expect to be able to verify a single * filesystem block of data at a time. Therefore, the Merkle tree block * size must also be less than or equal to the filesystem block size. * * The above are the only hard limitations, so in theory the Merkle tree * block size could be as small as twice the digest size. However, * that's not useful, and it would result in some unusually deep and * large Merkle trees. So we currently require that the Merkle tree * block size be at least 1024 bytes. That's small enough to test the * sub-page block case on systems with 4K pages, but not too small. */ if (log_blocksize < 10 || log_blocksize > PAGE_SHIFT || log_blocksize > inode->i_blkbits) { fsverity_warn(inode, "Unsupported log_blocksize: %u", log_blocksize); err = -EINVAL; goto out_err; } params->log_blocksize = log_blocksize; params->block_size = 1 << log_blocksize; params->log_blocks_per_page = PAGE_SHIFT - log_blocksize; params->blocks_per_page = 1 << params->log_blocks_per_page; if (WARN_ON_ONCE(!is_power_of_2(params->digest_size))) { err = -EINVAL; goto out_err; } if (params->block_size < 2 * params->digest_size) { fsverity_warn(inode, "Merkle tree block size (%u) too small for hash algorithm \"%s\"", params->block_size, hash_alg->name); err = -EINVAL; goto out_err; } params->log_digestsize = ilog2(params->digest_size); params->log_arity = log_blocksize - params->log_digestsize; params->hashes_per_block = 1 << params->log_arity; /* * Compute the number of levels in the Merkle tree and create a map from * level to the starting block of that level. Level 'num_levels - 1' is * the root and is stored first. Level 0 is the level directly "above" * the data blocks and is stored last. */ /* Compute number of levels and the number of blocks in each level */ blocks = ((u64)inode->i_size + params->block_size - 1) >> log_blocksize; while (blocks > 1) { if (params->num_levels >= FS_VERITY_MAX_LEVELS) { fsverity_err(inode, "Too many levels in Merkle tree"); err = -EFBIG; goto out_err; } blocks = (blocks + params->hashes_per_block - 1) >> params->log_arity; blocks_in_level[params->num_levels++] = blocks; } /* Compute the starting block of each level */ offset = 0; for (level = (int)params->num_levels - 1; level >= 0; level--) { params->level_start[level] = offset; offset += blocks_in_level[level]; } /* * With block_size != PAGE_SIZE, an in-memory bitmap will need to be * allocated to track the "verified" status of hash blocks. Don't allow * this bitmap to get too large. For now, limit it to 1 MiB, which * limits the file size to about 4.4 TB with SHA-256 and 4K blocks. * * Together with the fact that the data, and thus also the Merkle tree, * cannot have more than ULONG_MAX pages, this implies that hash block * indices can always fit in an 'unsigned long'. But to be safe, we * explicitly check for that too. Note, this is only for hash block * indices; data block indices might not fit in an 'unsigned long'. */ if ((params->block_size != PAGE_SIZE && offset > 1 << 23) || offset > ULONG_MAX) { fsverity_err(inode, "Too many blocks in Merkle tree"); err = -EFBIG; goto out_err; } params->tree_size = offset << log_blocksize; params->tree_pages = PAGE_ALIGN(params->tree_size) >> PAGE_SHIFT; return 0; out_err: kfree(params->hashstate); memset(params, 0, sizeof(*params)); return err; } /* * Compute the file digest by hashing the fsverity_descriptor excluding the * builtin signature and with the sig_size field set to 0. */ static int compute_file_digest(const struct fsverity_hash_alg *hash_alg, struct fsverity_descriptor *desc, u8 *file_digest) { __le32 sig_size = desc->sig_size; int err; desc->sig_size = 0; err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), file_digest); desc->sig_size = sig_size; return err; } /* * Create a new fsverity_info from the given fsverity_descriptor (with optional * appended builtin signature), and check the signature if present. The * fsverity_descriptor must have already undergone basic validation. */ struct fsverity_info *fsverity_create_info(const struct inode *inode, struct fsverity_descriptor *desc) { struct fsverity_info *vi; int err; vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL); if (!vi) return ERR_PTR(-ENOMEM); vi->inode = inode; err = fsverity_init_merkle_tree_params(&vi->tree_params, inode, desc->hash_algorithm, desc->log_blocksize, desc->salt, desc->salt_size); if (err) { fsverity_err(inode, "Error %d initializing Merkle tree parameters", err); goto fail; } memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size); err = compute_file_digest(vi->tree_params.hash_alg, desc, vi->file_digest); if (err) { fsverity_err(inode, "Error %d computing file digest", err); goto fail; } err = fsverity_verify_signature(vi, desc->signature, le32_to_cpu(desc->sig_size)); if (err) goto fail; if (vi->tree_params.block_size != PAGE_SIZE) { /* * When the Merkle tree block size and page size differ, we use * a bitmap to keep track of which hash blocks have been * verified. This bitmap must contain one bit per hash block, * including alignment to a page boundary at the end. * * Eventually, to support extremely large files in an efficient * way, it might be necessary to make pages of this bitmap * reclaimable. But for now, simply allocating the whole bitmap * is a simple solution that works well on the files on which * fsverity is realistically used. E.g., with SHA-256 and 4K * blocks, a 100MB file only needs a 24-byte bitmap, and the * bitmap for any file under 17GB fits in a 4K page. */ unsigned long num_bits = vi->tree_params.tree_pages << vi->tree_params.log_blocks_per_page; vi->hash_block_verified = kvcalloc(BITS_TO_LONGS(num_bits), sizeof(unsigned long), GFP_KERNEL); if (!vi->hash_block_verified) { err = -ENOMEM; goto fail; } } return vi; fail: fsverity_free_info(vi); return ERR_PTR(err); } void fsverity_set_info(struct inode *inode, struct fsverity_info *vi) { /* * Multiple tasks may race to set ->i_verity_info, so use * cmpxchg_release(). This pairs with the smp_load_acquire() in * fsverity_get_info(). I.e., here we publish ->i_verity_info with a * RELEASE barrier so that other tasks can ACQUIRE it. */ if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) { /* Lost the race, so free the fsverity_info we allocated. */ fsverity_free_info(vi); /* * Afterwards, the caller may access ->i_verity_info directly, * so make sure to ACQUIRE the winning fsverity_info. */ (void)fsverity_get_info(inode); } } void fsverity_free_info(struct fsverity_info *vi) { if (!vi) return; kfree(vi->tree_params.hashstate); kvfree(vi->hash_block_verified); kmem_cache_free(fsverity_info_cachep, vi); } static bool validate_fsverity_descriptor(struct inode *inode, const struct fsverity_descriptor *desc, size_t desc_size) { if (desc_size < sizeof(*desc)) { fsverity_err(inode, "Unrecognized descriptor size: %zu bytes", desc_size); return false; } if (desc->version != 1) { fsverity_err(inode, "Unrecognized descriptor version: %u", desc->version); return false; } if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) { fsverity_err(inode, "Reserved bits set in descriptor"); return false; } if (desc->salt_size > sizeof(desc->salt)) { fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size); return false; } if (le64_to_cpu(desc->data_size) != inode->i_size) { fsverity_err(inode, "Wrong data_size: %llu (desc) != %lld (inode)", le64_to_cpu(desc->data_size), inode->i_size); return false; } if (le32_to_cpu(desc->sig_size) > desc_size - sizeof(*desc)) { fsverity_err(inode, "Signature overflows verity descriptor"); return false; } return true; } /* * Read the inode's fsverity_descriptor (with optional appended builtin * signature) from the filesystem, and do basic validation of it. */ int fsverity_get_descriptor(struct inode *inode, struct fsverity_descriptor **desc_ret) { int res; struct fsverity_descriptor *desc; res = inode->i_sb->s_vop->get_verity_descriptor(inode, NULL, 0); if (res < 0) { fsverity_err(inode, "Error %d getting verity descriptor size", res); return res; } if (res > FS_VERITY_MAX_DESCRIPTOR_SIZE) { fsverity_err(inode, "Verity descriptor is too large (%d bytes)", res); return -EMSGSIZE; } desc = kmalloc(res, GFP_KERNEL); if (!desc) return -ENOMEM; res = inode->i_sb->s_vop->get_verity_descriptor(inode, desc, res); if (res < 0) { fsverity_err(inode, "Error %d reading verity descriptor", res); kfree(desc); return res; } if (!validate_fsverity_descriptor(inode, desc, res)) { kfree(desc); return -EINVAL; } *desc_ret = desc; return 0; } /* Ensure the inode has an ->i_verity_info */ static int ensure_verity_info(struct inode *inode) { struct fsverity_info *vi = fsverity_get_info(inode); struct fsverity_descriptor *desc; int err; if (vi) return 0; err = fsverity_get_descriptor(inode, &desc); if (err) return err; vi = fsverity_create_info(inode, desc); if (IS_ERR(vi)) { err = PTR_ERR(vi); goto out_free_desc; } fsverity_set_info(inode, vi); err = 0; out_free_desc: kfree(desc); return err; } int __fsverity_file_open(struct inode *inode, struct file *filp) { if (filp->f_mode & FMODE_WRITE) return -EPERM; return ensure_verity_info(inode); } EXPORT_SYMBOL_GPL(__fsverity_file_open); int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (attr->ia_valid & ATTR_SIZE) return -EPERM; return 0; } EXPORT_SYMBOL_GPL(__fsverity_prepare_setattr); void __fsverity_cleanup_inode(struct inode *inode) { fsverity_free_info(inode->i_verity_info); inode->i_verity_info = NULL; } EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode); void __init fsverity_init_info_cache(void) { fsverity_info_cachep = KMEM_CACHE_USERCOPY( fsverity_info, SLAB_RECLAIM_ACCOUNT | SLAB_PANIC, file_digest); } |
5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 | // SPDX-License-Identifier: GPL-2.0 /* * NFS exporting and validation. * * We maintain a list of clients, each of which has a list of * exports. To export an fs to a given client, you first have * to create the client entry with NFSCTL_ADDCLIENT, which * creates a client control block and adds it to the hash * table. Then, you call NFSCTL_EXPORT for each fs. * * * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de> */ #include <linux/slab.h> #include <linux/namei.h> #include <linux/module.h> #include <linux/exportfs.h> #include <linux/sunrpc/svc_xprt.h> #include "nfsd.h" #include "nfsfh.h" #include "netns.h" #include "pnfs.h" #include "filecache.h" #include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT /* * We have two caches. * One maps client+vfsmnt+dentry to export options - the export map * The other maps client+filehandle-fragment to export options. - the expkey map * * The export options are actually stored in the first map, and the * second map contains a reference to the entry in the first map. */ #define EXPKEY_HASHBITS 8 #define EXPKEY_HASHMAX (1 << EXPKEY_HASHBITS) #define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1) static void expkey_put(struct kref *ref) { struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); if (test_bit(CACHE_VALID, &key->h.flags) && !test_bit(CACHE_NEGATIVE, &key->h.flags)) path_put(&key->ek_path); auth_domain_put(key->ek_client); kfree_rcu(key, ek_rcu); } static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) { return sunrpc_cache_pipe_upcall(cd, h); } static void expkey_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) { /* client fsidtype \xfsid */ struct svc_expkey *ek = container_of(h, struct svc_expkey, h); char type[5]; qword_add(bpp, blen, ek->ek_client->name); snprintf(type, 5, "%d", ek->ek_fsidtype); qword_add(bpp, blen, type); qword_addhex(bpp, blen, (char*)ek->ek_fsid, key_len(ek->ek_fsidtype)); (*bpp)[-1] = '\n'; } static struct svc_expkey *svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new, struct svc_expkey *old); static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *); static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client fsidtype fsid expiry [path] */ char *buf; int len; struct auth_domain *dom = NULL; int err; int fsidtype; char *ep; struct svc_expkey key; struct svc_expkey *ek = NULL; if (mesg[mlen - 1] != '\n') return -EINVAL; mesg[mlen-1] = 0; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); err = -ENOMEM; if (!buf) goto out; err = -EINVAL; if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; err = -ENOENT; dom = auth_domain_find(buf); if (!dom) goto out; dprintk("found domain %s\n", buf); err = -EINVAL; if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; fsidtype = simple_strtoul(buf, &ep, 10); if (*ep) goto out; dprintk("found fsidtype %d\n", fsidtype); if (key_len(fsidtype)==0) /* invalid type */ goto out; if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) goto out; dprintk("found fsid length %d\n", len); if (len != key_len(fsidtype)) goto out; /* OK, we seem to have a valid key */ key.h.flags = 0; err = get_expiry(&mesg, &key.h.expiry_time); if (err) goto out; key.ek_client = dom; key.ek_fsidtype = fsidtype; memcpy(key.ek_fsid, buf, len); ek = svc_expkey_lookup(cd, &key); err = -ENOMEM; if (!ek) goto out; /* now we want a pathname, or empty meaning NEGATIVE */ err = -EINVAL; len = qword_get(&mesg, buf, PAGE_SIZE); if (len < 0) goto out; dprintk("Path seems to be <%s>\n", buf); err = 0; if (len == 0) { set_bit(CACHE_NEGATIVE, &key.h.flags); ek = svc_expkey_update(cd, &key, ek); if (ek) trace_nfsd_expkey_update(ek, NULL); else err = -ENOMEM; } else { err = kern_path(buf, 0, &key.ek_path); if (err) goto out; dprintk("Found the path %s\n", buf); ek = svc_expkey_update(cd, &key, ek); if (ek) trace_nfsd_expkey_update(ek, buf); else err = -ENOMEM; path_put(&key.ek_path); } cache_flush(); out: if (ek) cache_put(&ek->h, cd); if (dom) auth_domain_put(dom); kfree(buf); return err; } static int expkey_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { struct svc_expkey *ek ; int i; if (h ==NULL) { seq_puts(m, "#domain fsidtype fsid [path]\n"); return 0; } ek = container_of(h, struct svc_expkey, h); seq_printf(m, "%s %d 0x", ek->ek_client->name, ek->ek_fsidtype); for (i=0; i < key_len(ek->ek_fsidtype)/4; i++) seq_printf(m, "%08x", ek->ek_fsid[i]); if (test_bit(CACHE_VALID, &h->flags) && !test_bit(CACHE_NEGATIVE, &h->flags)) { seq_printf(m, " "); seq_path(m, &ek->ek_path, "\\ \t\n"); } seq_printf(m, "\n"); return 0; } static inline int expkey_match (struct cache_head *a, struct cache_head *b) { struct svc_expkey *orig = container_of(a, struct svc_expkey, h); struct svc_expkey *new = container_of(b, struct svc_expkey, h); if (orig->ek_fsidtype != new->ek_fsidtype || orig->ek_client != new->ek_client || memcmp(orig->ek_fsid, new->ek_fsid, key_len(orig->ek_fsidtype)) != 0) return 0; return 1; } static inline void expkey_init(struct cache_head *cnew, struct cache_head *citem) { struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); struct svc_expkey *item = container_of(citem, struct svc_expkey, h); kref_get(&item->ek_client->ref); new->ek_client = item->ek_client; new->ek_fsidtype = item->ek_fsidtype; memcpy(new->ek_fsid, item->ek_fsid, sizeof(new->ek_fsid)); } static inline void expkey_update(struct cache_head *cnew, struct cache_head *citem) { struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); struct svc_expkey *item = container_of(citem, struct svc_expkey, h); new->ek_path = item->ek_path; path_get(&item->ek_path); } static struct cache_head *expkey_alloc(void) { struct svc_expkey *i = kmalloc(sizeof(*i), GFP_KERNEL); if (i) return &i->h; else return NULL; } static void expkey_flush(void) { /* * Take the nfsd_mutex here to ensure that the file cache is not * destroyed while we're in the middle of flushing. */ mutex_lock(&nfsd_mutex); nfsd_file_cache_purge(current->nsproxy->net_ns); mutex_unlock(&nfsd_mutex); } static const struct cache_detail svc_expkey_cache_template = { .owner = THIS_MODULE, .hash_size = EXPKEY_HASHMAX, .name = "nfsd.fh", .cache_put = expkey_put, .cache_upcall = expkey_upcall, .cache_request = expkey_request, .cache_parse = expkey_parse, .cache_show = expkey_show, .match = expkey_match, .init = expkey_init, .update = expkey_update, .alloc = expkey_alloc, .flush = expkey_flush, }; static int svc_expkey_hash(struct svc_expkey *item) { int hash = item->ek_fsidtype; char * cp = (char*)item->ek_fsid; int len = key_len(item->ek_fsidtype); hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); hash &= EXPKEY_HASHMASK; return hash; } static struct svc_expkey * svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *item) { struct cache_head *ch; int hash = svc_expkey_hash(item); ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct svc_expkey, h); else return NULL; } static struct svc_expkey * svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new, struct svc_expkey *old) { struct cache_head *ch; int hash = svc_expkey_hash(new); ch = sunrpc_cache_update(cd, &new->h, &old->h, hash); if (ch) return container_of(ch, struct svc_expkey, h); else return NULL; } #define EXPORT_HASHBITS 8 #define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc) { struct nfsd4_fs_location *locations = fsloc->locations; int i; if (!locations) return; for (i = 0; i < fsloc->locations_count; i++) { kfree(locations[i].path); kfree(locations[i].hosts); } kfree(locations); fsloc->locations = NULL; } static int export_stats_init(struct export_stats *stats) { stats->start_time = ktime_get_seconds(); return nfsd_percpu_counters_init(stats->counter, EXP_STATS_COUNTERS_NUM); } static void export_stats_reset(struct export_stats *stats) { if (stats) nfsd_percpu_counters_reset(stats->counter, EXP_STATS_COUNTERS_NUM); } static void export_stats_destroy(struct export_stats *stats) { if (stats) nfsd_percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM); } static void svc_export_put(struct kref *ref) { struct svc_export *exp = container_of(ref, struct svc_export, h.ref); path_put(&exp->ex_path); auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); export_stats_destroy(exp->ex_stats); kfree(exp->ex_stats); kfree(exp->ex_uuid); kfree_rcu(exp, ex_rcu); } static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) { return sunrpc_cache_pipe_upcall(cd, h); } static void svc_export_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) { /* client path */ struct svc_export *exp = container_of(h, struct svc_export, h); char *pth; qword_add(bpp, blen, exp->ex_client->name); pth = d_path(&exp->ex_path, *bpp, *blen); if (IS_ERR(pth)) { /* is this correct? */ (*bpp)[0] = '\n'; return; } qword_add(bpp, blen, pth); (*bpp)[-1] = '\n'; } static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); static int check_export(struct path *path, int *flags, unsigned char *uuid) { struct inode *inode = d_inode(path->dentry); /* * We currently export only dirs, regular files, and (for v4 * pseudoroot) symlinks. */ if (!S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode) && !S_ISREG(inode->i_mode)) return -ENOTDIR; /* * Mountd should never pass down a writeable V4ROOT export, but, * just to make sure: */ if (*flags & NFSEXP_V4ROOT) *flags |= NFSEXP_READONLY; /* There are two requirements on a filesystem to be exportable. * 1: We must be able to identify the filesystem from a number. * either a device number (so FS_REQUIRES_DEV needed) * or an FSID number (so NFSEXP_FSID or ->uuid is needed). * 2: We must be able to find an inode from a filehandle. * This means that s_export_op must be set. * 3: We must not currently be on an idmapped mount. */ if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && !(*flags & NFSEXP_FSID) && uuid == NULL) { dprintk("exp_export: export of non-dev fs without fsid\n"); return -EINVAL; } if (!exportfs_can_decode_fh(inode->i_sb->s_export_op)) { dprintk("exp_export: export of invalid fs type.\n"); return -EINVAL; } if (is_idmapped_mnt(path->mnt)) { dprintk("exp_export: export of idmapped mounts not yet supported.\n"); return -EINVAL; } if (inode->i_sb->s_export_op->flags & EXPORT_OP_NOSUBTREECHK && !(*flags & NFSEXP_NOSUBTREECHECK)) { dprintk("%s: %s does not support subtree checking!\n", __func__, inode->i_sb->s_type->name); return -EINVAL; } return 0; } #ifdef CONFIG_NFSD_V4 static int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) { int len; int migrated, i, err; /* more than one fsloc */ if (fsloc->locations) return -EINVAL; /* listsize */ err = get_uint(mesg, &fsloc->locations_count); if (err) return err; if (fsloc->locations_count > MAX_FS_LOCATIONS) return -EINVAL; if (fsloc->locations_count == 0) return 0; fsloc->locations = kcalloc(fsloc->locations_count, sizeof(struct nfsd4_fs_location), GFP_KERNEL); if (!fsloc->locations) return -ENOMEM; for (i=0; i < fsloc->locations_count; i++) { /* colon separated host list */ err = -EINVAL; len = qword_get(mesg, buf, PAGE_SIZE); if (len <= 0) goto out_free_all; err = -ENOMEM; fsloc->locations[i].hosts = kstrdup(buf, GFP_KERNEL); if (!fsloc->locations[i].hosts) goto out_free_all; err = -EINVAL; /* slash separated path component list */ len = qword_get(mesg, buf, PAGE_SIZE); if (len <= 0) goto out_free_all; err = -ENOMEM; fsloc->locations[i].path = kstrdup(buf, GFP_KERNEL); if (!fsloc->locations[i].path) goto out_free_all; } /* migrated */ err = get_int(mesg, &migrated); if (err) goto out_free_all; err = -EINVAL; if (migrated < 0 || migrated > 1) goto out_free_all; fsloc->migrated = migrated; return 0; out_free_all: nfsd4_fslocs_free(fsloc); return err; } static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { struct exp_flavor_info *f; u32 listsize; int err; /* more than one secinfo */ if (exp->ex_nflavors) return -EINVAL; err = get_uint(mesg, &listsize); if (err) return err; if (listsize > MAX_SECINFO_LIST) return -EINVAL; for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) { err = get_uint(mesg, &f->pseudoflavor); if (err) return err; /* * XXX: It would be nice to also check whether this * pseudoflavor is supported, so we can discover the * problem at export time instead of when a client fails * to authenticate. */ err = get_uint(mesg, &f->flags); if (err) return err; /* Only some flags are allowed to differ between flavors: */ if (~NFSEXP_SECINFO_FLAGS & (f->flags ^ exp->ex_flags)) return -EINVAL; } exp->ex_nflavors = listsize; return 0; } #else /* CONFIG_NFSD_V4 */ static inline int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc){return 0;} static inline int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } #endif static int xprtsec_parse(char **mesg, char *buf, struct svc_export *exp) { unsigned int i, mode, listsize; int err; err = get_uint(mesg, &listsize); if (err) return err; if (listsize > NFSEXP_XPRTSEC_NUM) return -EINVAL; exp->ex_xprtsec_modes = 0; for (i = 0; i < listsize; i++) { err = get_uint(mesg, &mode); if (err) return err; if (mode > NFSEXP_XPRTSEC_MTLS) return -EINVAL; exp->ex_xprtsec_modes |= mode; } return 0; } static inline int nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid) { int len; /* more than one uuid */ if (*puuid) return -EINVAL; /* expect a 16 byte uuid encoded as \xXXXX... */ len = qword_get(mesg, buf, PAGE_SIZE); if (len != EX_UUID_LEN) return -EINVAL; *puuid = kmemdup(buf, EX_UUID_LEN, GFP_KERNEL); if (*puuid == NULL) return -ENOMEM; return 0; } static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client path expiry [flags anonuid anongid fsid] */ char *buf; int err; struct auth_domain *dom = NULL; struct svc_export exp = {}, *expp; int an_int; if (mesg[mlen-1] != '\n') return -EINVAL; mesg[mlen-1] = 0; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; /* client */ err = -EINVAL; if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; err = -ENOENT; dom = auth_domain_find(buf); if (!dom) goto out; /* path */ err = -EINVAL; if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out1; err = kern_path(buf, 0, &exp.ex_path); if (err) goto out1; exp.ex_client = dom; exp.cd = cd; exp.ex_devid_map = NULL; exp.ex_xprtsec_modes = NFSEXP_XPRTSEC_ALL; /* expiry */ err = get_expiry(&mesg, &exp.h.expiry_time); if (err) goto out3; /* flags */ err = get_int(&mesg, &an_int); if (err == -ENOENT) { err = 0; set_bit(CACHE_NEGATIVE, &exp.h.flags); } else { if (err || an_int < 0) goto out3; exp.ex_flags= an_int; /* anon uid */ err = get_int(&mesg, &an_int); if (err) goto out3; exp.ex_anon_uid= make_kuid(current_user_ns(), an_int); /* anon gid */ err = get_int(&mesg, &an_int); if (err) goto out3; exp.ex_anon_gid= make_kgid(current_user_ns(), an_int); /* fsid */ err = get_int(&mesg, &an_int); if (err) goto out3; exp.ex_fsid = an_int; while (qword_get(&mesg, buf, PAGE_SIZE) > 0) { if (strcmp(buf, "fsloc") == 0) err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); else if (strcmp(buf, "uuid") == 0) err = nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid); else if (strcmp(buf, "secinfo") == 0) err = secinfo_parse(&mesg, buf, &exp); else if (strcmp(buf, "xprtsec") == 0) err = xprtsec_parse(&mesg, buf, &exp); else /* quietly ignore unknown words and anything * following. Newer user-space can try to set * new values, then see what the result was. */ break; if (err) goto out4; } err = check_export(&exp.ex_path, &exp.ex_flags, exp.ex_uuid); if (err) goto out4; /* * No point caching this if it would immediately expire. * Also, this protects exportfs's dummy export from the * anon_uid/anon_gid checks: */ if (exp.h.expiry_time < seconds_since_boot()) goto out4; /* * For some reason exportfs has been passing down an * invalid (-1) uid & gid on the "dummy" export which it * uses to test export support. To make sure exportfs * sees errors from check_export we therefore need to * delay these checks till after check_export: */ err = -EINVAL; if (!uid_valid(exp.ex_anon_uid)) goto out4; if (!gid_valid(exp.ex_anon_gid)) goto out4; err = 0; nfsd4_setup_layout_type(&exp); } expp = svc_export_lookup(&exp); if (!expp) { err = -ENOMEM; goto out4; } expp = svc_export_update(&exp, expp); if (expp) { trace_nfsd_export_update(expp); cache_flush(); exp_put(expp); } else err = -ENOMEM; out4: nfsd4_fslocs_free(&exp.ex_fslocs); kfree(exp.ex_uuid); out3: path_put(&exp.ex_path); out1: auth_domain_put(dom); out: kfree(buf); return err; } static void exp_flags(struct seq_file *m, int flag, int fsid, kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs); static void show_secinfo(struct seq_file *m, struct svc_export *exp); static int is_export_stats_file(struct seq_file *m) { /* * The export_stats file uses the same ops as the exports file. * We use the file's name to determine the reported info per export. * There is no rename in nsfdfs, so d_name.name is stable. */ return !strcmp(m->file->f_path.dentry->d_name.name, "export_stats"); } static int svc_export_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { struct svc_export *exp; bool export_stats = is_export_stats_file(m); if (h == NULL) { if (export_stats) seq_puts(m, "#path domain start-time\n#\tstats\n"); else seq_puts(m, "#path domain(flags)\n"); return 0; } exp = container_of(h, struct svc_export, h); seq_path(m, &exp->ex_path, " \t\n\\"); seq_putc(m, '\t'); seq_escape(m, exp->ex_client->name, " \t\n\\"); if (export_stats) { struct percpu_counter *counter = exp->ex_stats->counter; seq_printf(m, "\t%lld\n", exp->ex_stats->start_time); seq_printf(m, "\tfh_stale: %lld\n", percpu_counter_sum_positive(&counter[EXP_STATS_FH_STALE])); seq_printf(m, "\tio_read: %lld\n", percpu_counter_sum_positive(&counter[EXP_STATS_IO_READ])); seq_printf(m, "\tio_write: %lld\n", percpu_counter_sum_positive(&counter[EXP_STATS_IO_WRITE])); seq_putc(m, '\n'); return 0; } seq_putc(m, '('); if (test_bit(CACHE_VALID, &h->flags) && !test_bit(CACHE_NEGATIVE, &h->flags)) { exp_flags(m, exp->ex_flags, exp->ex_fsid, exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs); if (exp->ex_uuid) { int i; seq_puts(m, ",uuid="); for (i = 0; i < EX_UUID_LEN; i++) { if ((i&3) == 0 && i) seq_putc(m, ':'); seq_printf(m, "%02x", exp->ex_uuid[i]); } } show_secinfo(m, exp); } seq_puts(m, ")\n"); return 0; } static int svc_export_match(struct cache_head *a, struct cache_head *b) { struct svc_export *orig = container_of(a, struct svc_export, h); struct svc_export *new = container_of(b, struct svc_export, h); return orig->ex_client == new->ex_client && path_equal(&orig->ex_path, &new->ex_path); } static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) { struct svc_export *new = container_of(cnew, struct svc_export, h); struct svc_export *item = container_of(citem, struct svc_export, h); kref_get(&item->ex_client->ref); new->ex_client = item->ex_client; new->ex_path = item->ex_path; path_get(&item->ex_path); new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; new->ex_layout_types = 0; new->ex_uuid = NULL; new->cd = item->cd; export_stats_reset(new->ex_stats); } static void export_update(struct cache_head *cnew, struct cache_head *citem) { struct svc_export *new = container_of(cnew, struct svc_export, h); struct svc_export *item = container_of(citem, struct svc_export, h); int i; new->ex_flags = item->ex_flags; new->ex_anon_uid = item->ex_anon_uid; new->ex_anon_gid = item->ex_anon_gid; new->ex_fsid = item->ex_fsid; new->ex_devid_map = item->ex_devid_map; item->ex_devid_map = NULL; new->ex_uuid = item->ex_uuid; item->ex_uuid = NULL; new->ex_fslocs.locations = item->ex_fslocs.locations; item->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = item->ex_fslocs.locations_count; item->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = item->ex_fslocs.migrated; item->ex_fslocs.migrated = 0; new->ex_layout_types = item->ex_layout_types; new->ex_nflavors = item->ex_nflavors; for (i = 0; i < MAX_SECINFO_LIST; i++) { new->ex_flavors[i] = item->ex_flavors[i]; } new->ex_xprtsec_modes = item->ex_xprtsec_modes; } static struct cache_head *svc_export_alloc(void) { struct svc_export *i = kmalloc(sizeof(*i), GFP_KERNEL); if (!i) return NULL; i->ex_stats = kmalloc(sizeof(*(i->ex_stats)), GFP_KERNEL); if (!i->ex_stats) { kfree(i); return NULL; } if (export_stats_init(i->ex_stats)) { kfree(i->ex_stats); kfree(i); return NULL; } return &i->h; } static const struct cache_detail svc_export_cache_template = { .owner = THIS_MODULE, .hash_size = EXPORT_HASHMAX, .name = "nfsd.export", .cache_put = svc_export_put, .cache_upcall = svc_export_upcall, .cache_request = svc_export_request, .cache_parse = svc_export_parse, .cache_show = svc_export_show, .match = svc_export_match, .init = svc_export_init, .update = export_update, .alloc = svc_export_alloc, }; static int svc_export_hash(struct svc_export *exp) { int hash; hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS); hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS); hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS); return hash; } static struct svc_export * svc_export_lookup(struct svc_export *exp) { struct cache_head *ch; int hash = svc_export_hash(exp); ch = sunrpc_cache_lookup_rcu(exp->cd, &exp->h, hash); if (ch) return container_of(ch, struct svc_export, h); else return NULL; } static struct svc_export * svc_export_update(struct svc_export *new, struct svc_export *old) { struct cache_head *ch; int hash = svc_export_hash(old); ch = sunrpc_cache_update(old->cd, &new->h, &old->h, hash); if (ch) return container_of(ch, struct svc_export, h); else return NULL; } static struct svc_expkey * exp_find_key(struct cache_detail *cd, struct auth_domain *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) { struct svc_expkey key, *ek; int err; if (!clp) return ERR_PTR(-ENOENT); key.ek_client = clp; key.ek_fsidtype = fsid_type; memcpy(key.ek_fsid, fsidv, key_len(fsid_type)); ek = svc_expkey_lookup(cd, &key); if (ek == NULL) return ERR_PTR(-ENOMEM); err = cache_check(cd, &ek->h, reqp); if (err) { trace_nfsd_exp_find_key(&key, err); return ERR_PTR(err); } return ek; } static struct svc_export * exp_get_by_name(struct cache_detail *cd, struct auth_domain *clp, const struct path *path, struct cache_req *reqp) { struct svc_export *exp, key; int err; if (!clp) return ERR_PTR(-ENOENT); key.ex_client = clp; key.ex_path = *path; key.cd = cd; exp = svc_export_lookup(&key); if (exp == NULL) return ERR_PTR(-ENOMEM); err = cache_check(cd, &exp->h, reqp); if (err) { trace_nfsd_exp_get_by_name(&key, err); return ERR_PTR(err); } return exp; } /* * Find the export entry for a given dentry. */ static struct svc_export * exp_parent(struct cache_detail *cd, struct auth_domain *clp, struct path *path) { struct dentry *saved = dget(path->dentry); struct svc_export *exp = exp_get_by_name(cd, clp, path, NULL); while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { struct dentry *parent = dget_parent(path->dentry); dput(path->dentry); path->dentry = parent; exp = exp_get_by_name(cd, clp, path, NULL); } dput(path->dentry); path->dentry = saved; return exp; } /* * Obtain the root fh on behalf of a client. * This could be done in user space, but I feel that it adds some safety * since its harder to fool a kernel module than a user space program. */ int exp_rootfh(struct net *net, struct auth_domain *clp, char *name, struct knfsd_fh *f, int maxsize) { struct svc_export *exp; struct path path; struct inode *inode; struct svc_fh fh; int err; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct cache_detail *cd = nn->svc_export_cache; err = -EPERM; /* NB: we probably ought to check that it's NUL-terminated */ if (kern_path(name, 0, &path)) { printk("nfsd: exp_rootfh path not found %s", name); return err; } inode = d_inode(path.dentry); dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", name, path.dentry, clp->name, inode->i_sb->s_id, inode->i_ino); exp = exp_parent(cd, clp, &path); if (IS_ERR(exp)) { err = PTR_ERR(exp); goto out; } /* * fh must be initialized before calling fh_compose */ fh_init(&fh, maxsize); if (fh_compose(&fh, exp, path.dentry, NULL)) err = -EINVAL; else err = 0; memcpy(f, &fh.fh_handle, sizeof(struct knfsd_fh)); fh_put(&fh); exp_put(exp); out: path_put(&path); return err; } static struct svc_export *exp_find(struct cache_detail *cd, struct auth_domain *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) { struct svc_export *exp; struct nfsd_net *nn = net_generic(cd->net, nfsd_net_id); struct svc_expkey *ek = exp_find_key(nn->svc_expkey_cache, clp, fsid_type, fsidv, reqp); if (IS_ERR(ek)) return ERR_CAST(ek); exp = exp_get_by_name(cd, clp, &ek->ek_path, reqp); cache_put(&ek->h, nn->svc_expkey_cache); if (IS_ERR(exp)) return ERR_CAST(exp); return exp; } __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) { struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors; struct svc_xprt *xprt = rqstp->rq_xprt; if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_NONE) { if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags)) goto ok; } if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_TLS) { if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && !test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) goto ok; } if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_MTLS) { if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) goto ok; } goto denied; ok: /* legacy gss-only clients are always OK: */ if (exp->ex_client == rqstp->rq_gssclient) return 0; /* ip-address based client; check sec= export option: */ for (f = exp->ex_flavors; f < end; f++) { if (f->pseudoflavor == rqstp->rq_cred.cr_flavor) return 0; } /* defaults in absence of sec= options: */ if (exp->ex_nflavors == 0) { if (rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL || rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX) return 0; } /* If the compound op contains a spo_must_allowed op, * it will be sent with integrity/protection which * will have to be expressly allowed on mounts that * don't support it */ if (nfsd4_spo_must_allow(rqstp)) return 0; denied: return rqstp->rq_vers < 4 ? nfserr_acces : nfserr_wrongsec; } /* * Uses rq_client and rq_gssclient to find an export; uses rq_client (an * auth_unix client) if it's available and has secinfo information; * otherwise, will try to use rq_gssclient. * * Called from functions that handle requests; functions that do work on * behalf of mountd are passed a single client name to use, and should * use exp_get_by_name() or exp_find(). */ struct svc_export * rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct cache_detail *cd = nn->svc_export_cache; if (rqstp->rq_client == NULL) goto gss; /* First try the auth_unix client: */ exp = exp_get_by_name(cd, rqstp->rq_client, path, &rqstp->rq_chandle); if (PTR_ERR(exp) == -ENOENT) goto gss; if (IS_ERR(exp)) return exp; /* If it has secinfo, assume there are no gss/... clients */ if (exp->ex_nflavors > 0) return exp; gss: /* Otherwise, try falling back on gss client */ if (rqstp->rq_gssclient == NULL) return exp; gssexp = exp_get_by_name(cd, rqstp->rq_gssclient, path, &rqstp->rq_chandle); if (PTR_ERR(gssexp) == -ENOENT) return exp; if (!IS_ERR(exp)) exp_put(exp); return gssexp; } struct svc_export * rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct cache_detail *cd = nn->svc_export_cache; if (rqstp->rq_client == NULL) goto gss; /* First try the auth_unix client: */ exp = exp_find(cd, rqstp->rq_client, fsid_type, fsidv, &rqstp->rq_chandle); if (PTR_ERR(exp) == -ENOENT) goto gss; if (IS_ERR(exp)) return exp; /* If it has secinfo, assume there are no gss/... clients */ if (exp->ex_nflavors > 0) return exp; gss: /* Otherwise, try falling back on gss client */ if (rqstp->rq_gssclient == NULL) return exp; gssexp = exp_find(cd, rqstp->rq_gssclient, fsid_type, fsidv, &rqstp->rq_chandle); if (PTR_ERR(gssexp) == -ENOENT) return exp; if (!IS_ERR(exp)) exp_put(exp); return gssexp; } struct svc_export * rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) { struct dentry *saved = dget(path->dentry); struct svc_export *exp = rqst_exp_get_by_name(rqstp, path); while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { struct dentry *parent = dget_parent(path->dentry); dput(path->dentry); path->dentry = parent; exp = rqst_exp_get_by_name(rqstp, path); } dput(path->dentry); path->dentry = saved; return exp; } struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp) { u32 fsidv[2]; mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); return rqst_exp_find(rqstp, FSID_NUM, fsidv); } /* * Called when we need the filehandle for the root of the pseudofs, * for a given NFSv4 client. The root is defined to be the * export point with fsid==0 */ __be32 exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) { struct svc_export *exp; __be32 rv; exp = rqst_find_fsidzero_export(rqstp); if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); exp_put(exp); return rv; } static struct flags { int flag; char *name[2]; } expflags[] = { { NFSEXP_READONLY, {"ro", "rw"}}, { NFSEXP_INSECURE_PORT, {"insecure", ""}}, { NFSEXP_ROOTSQUASH, {"root_squash", "no_root_squash"}}, { NFSEXP_ALLSQUASH, {"all_squash", ""}}, { NFSEXP_ASYNC, {"async", "sync"}}, { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}}, { NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}}, { NFSEXP_NOHIDE, {"nohide", ""}}, { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, { NFSEXP_PNFS, {"pnfs", ""}}, { NFSEXP_SECURITY_LABEL, {"security_label", ""}}, { 0, {"", ""}} }; static void show_expflags(struct seq_file *m, int flags, int mask) { struct flags *flg; int state, first = 0; for (flg = expflags; flg->flag; flg++) { if (flg->flag & ~mask) continue; state = (flg->flag & flags) ? 0 : 1; if (*flg->name[state]) seq_printf(m, "%s%s", first++?",":"", flg->name[state]); } } static void show_secinfo_flags(struct seq_file *m, int flags) { seq_printf(m, ","); show_expflags(m, flags, NFSEXP_SECINFO_FLAGS); } static bool secinfo_flags_equal(int f, int g) { f &= NFSEXP_SECINFO_FLAGS; g &= NFSEXP_SECINFO_FLAGS; return f == g; } static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end) { int flags; flags = (*fp)->flags; seq_printf(m, ",sec=%d", (*fp)->pseudoflavor); (*fp)++; while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) { seq_printf(m, ":%d", (*fp)->pseudoflavor); (*fp)++; } return flags; } static void show_secinfo(struct seq_file *m, struct svc_export *exp) { struct exp_flavor_info *f; struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; int flags; if (exp->ex_nflavors == 0) return; f = exp->ex_flavors; flags = show_secinfo_run(m, &f, end); if (!secinfo_flags_equal(flags, exp->ex_flags)) show_secinfo_flags(m, flags); while (f != end) { flags = show_secinfo_run(m, &f, end); show_secinfo_flags(m, flags); } } static void exp_flags(struct seq_file *m, int flag, int fsid, kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fsloc) { struct user_namespace *userns = m->file->f_cred->user_ns; show_expflags(m, flag, NFSEXP_ALLFLAGS); if (flag & NFSEXP_FSID) seq_printf(m, ",fsid=%d", fsid); if (!uid_eq(anonu, make_kuid(userns, (uid_t)-2)) && !uid_eq(anonu, make_kuid(userns, 0x10000-2))) seq_printf(m, ",anonuid=%u", from_kuid_munged(userns, anonu)); if (!gid_eq(anong, make_kgid(userns, (gid_t)-2)) && !gid_eq(anong, make_kgid(userns, 0x10000-2))) seq_printf(m, ",anongid=%u", from_kgid_munged(userns, anong)); if (fsloc && fsloc->locations_count > 0) { char *loctype = (fsloc->migrated) ? "refer" : "replicas"; int i; seq_printf(m, ",%s=", loctype); seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\"); seq_putc(m, '@'); seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\"); for (i = 1; i < fsloc->locations_count; i++) { seq_putc(m, ';'); seq_escape(m, fsloc->locations[i].path, ",;@ \t\n\\"); seq_putc(m, '@'); seq_escape(m, fsloc->locations[i].hosts, ",;@ \t\n\\"); } } } static int e_show(struct seq_file *m, void *p) { struct cache_head *cp = p; struct svc_export *exp = container_of(cp, struct svc_export, h); struct cache_detail *cd = m->private; bool export_stats = is_export_stats_file(m); if (p == SEQ_START_TOKEN) { seq_puts(m, "# Version 1.1\n"); if (export_stats) seq_puts(m, "# Path Client Start-time\n#\tStats\n"); else seq_puts(m, "# Path Client(Flags) # IPs\n"); return 0; } exp_get(exp); if (cache_check(cd, &exp->h, NULL)) return 0; exp_put(exp); return svc_export_show(m, cd, cp); } const struct seq_operations nfs_exports_op = { .start = cache_seq_start_rcu, .next = cache_seq_next_rcu, .stop = cache_seq_stop_rcu, .show = e_show, }; /* * Initialize the exports module. */ int nfsd_export_init(struct net *net) { int rv; struct nfsd_net *nn = net_generic(net, nfsd_net_id); dprintk("nfsd: initializing export module (net: %x).\n", net->ns.inum); nn->svc_export_cache = cache_create_net(&svc_export_cache_template, net); if (IS_ERR(nn->svc_export_cache)) return PTR_ERR(nn->svc_export_cache); rv = cache_register_net(nn->svc_export_cache, net); if (rv) goto destroy_export_cache; nn->svc_expkey_cache = cache_create_net(&svc_expkey_cache_template, net); if (IS_ERR(nn->svc_expkey_cache)) { rv = PTR_ERR(nn->svc_expkey_cache); goto unregister_export_cache; } rv = cache_register_net(nn->svc_expkey_cache, net); if (rv) goto destroy_expkey_cache; return 0; destroy_expkey_cache: cache_destroy_net(nn->svc_expkey_cache, net); unregister_export_cache: cache_unregister_net(nn->svc_export_cache, net); destroy_export_cache: cache_destroy_net(nn->svc_export_cache, net); return rv; } /* * Flush exports table - called when last nfsd thread is killed */ void nfsd_export_flush(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); cache_purge(nn->svc_expkey_cache); cache_purge(nn->svc_export_cache); } /* * Shutdown the exports module. */ void nfsd_export_shutdown(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); dprintk("nfsd: shutting down export module (net: %x).\n", net->ns.inum); cache_unregister_net(nn->svc_expkey_cache, net); cache_unregister_net(nn->svc_export_cache, net); cache_destroy_net(nn->svc_expkey_cache, net); cache_destroy_net(nn->svc_export_cache, net); svcauth_unix_purge(net); dprintk("nfsd: export shutdown complete (net: %x).\n", net->ns.inum); } |
373 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Device core Trace Support * Copyright (C) 2021, Intel Corporation * * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com> */ #undef TRACE_SYSTEM #define TRACE_SYSTEM dev #if !defined(__DEV_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define __DEV_TRACE_H #include <linux/device.h> #include <linux/tracepoint.h> #include <linux/types.h> DECLARE_EVENT_CLASS(devres, TP_PROTO(struct device *dev, const char *op, void *node, const char *name, size_t size), TP_ARGS(dev, op, node, name, size), TP_STRUCT__entry( __string(devname, dev_name(dev)) __field(struct device *, dev) __field(const char *, op) __field(void *, node) __field(const char *, name) __field(size_t, size) ), TP_fast_assign( __assign_str(devname, dev_name(dev)); __entry->op = op; __entry->node = node; __entry->name = name; __entry->size = size; ), TP_printk("%s %3s %p %s (%zu bytes)", __get_str(devname), __entry->op, __entry->node, __entry->name, __entry->size) ); DEFINE_EVENT(devres, devres_log, TP_PROTO(struct device *dev, const char *op, void *node, const char *name, size_t size), TP_ARGS(dev, op, node, name, size) ); #endif /* __DEV_TRACE_H */ /* this part has to be here */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h> |
45 46 110 67 46 22 3 3 183 9 9 172 35 124 15 129 7 7 5 116 3 122 8 9 9 30 24 19 20 20 76 22 55 16 45 14 5 10 10 8 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Fast Userspace Mutexes (which I call "Futexes!"). * (C) Rusty Russell, IBM 2002 * * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar * (C) Copyright 2003 Red Hat Inc, All Rights Reserved * * Removed page pinning, fix privately mapped COW pages and other cleanups * (C) Copyright 2003, 2004 Jamie Lokier * * Robust futex support started by Ingo Molnar * (C) Copyright 2006 Red Hat Inc, All Rights Reserved * Thanks to Thomas Gleixner for suggestions, analysis and fixes. * * PI-futex support started by Ingo Molnar and Thomas Gleixner * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * * PRIVATE futexes by Eric Dumazet * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> * * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> * Copyright (C) IBM Corporation, 2009 * Thanks to Thomas Gleixner for conceptual design and careful reviews. * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. * * "The futexes are also cursed." * "But they come in a choice of three flavours!" */ #include <linux/compat.h> #include <linux/jhash.h> #include <linux/pagemap.h> #include <linux/plist.h> #include <linux/memblock.h> #include <linux/fault-inject.h> #include <linux/slab.h> #include "futex.h" #include "../locking/rtmutex_common.h" /* * The base of the bucket array and its size are always used together * (after initialization only in futex_hash()), so ensure that they * reside in the same cacheline. */ static struct { struct futex_hash_bucket *queues; unsigned long hashsize; } __futex_data __read_mostly __aligned(2*sizeof(long)); #define futex_queues (__futex_data.queues) #define futex_hashsize (__futex_data.hashsize) /* * Fault injections for futexes. */ #ifdef CONFIG_FAIL_FUTEX static struct { struct fault_attr attr; bool ignore_private; } fail_futex = { .attr = FAULT_ATTR_INITIALIZER, .ignore_private = false, }; static int __init setup_fail_futex(char *str) { return setup_fault_attr(&fail_futex.attr, str); } __setup("fail_futex=", setup_fail_futex); bool should_fail_futex(bool fshared) { if (fail_futex.ignore_private && !fshared) return false; return should_fail(&fail_futex.attr, 1); } #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_futex_debugfs(void) { umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; dir = fault_create_debugfs_attr("fail_futex", NULL, &fail_futex.attr); if (IS_ERR(dir)) return PTR_ERR(dir); debugfs_create_bool("ignore-private", mode, dir, &fail_futex.ignore_private); return 0; } late_initcall(fail_futex_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ #endif /* CONFIG_FAIL_FUTEX */ /** * futex_hash - Return the hash bucket in the global hash * @key: Pointer to the futex key for which the hash is calculated * * We hash on the keys returned from get_futex_key (see below) and return the * corresponding hash bucket in the global hash. */ struct futex_hash_bucket *futex_hash(union futex_key *key) { u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, key->both.offset); return &futex_queues[hash & (futex_hashsize - 1)]; } /** * futex_setup_timer - set up the sleeping hrtimer. * @time: ptr to the given timeout value * @timeout: the hrtimer_sleeper structure to be set up * @flags: futex flags * @range_ns: optional range in ns * * Return: Initialized hrtimer_sleeper structure or NULL if no timeout * value given */ struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns) { if (!time) return NULL; hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC, HRTIMER_MODE_ABS); /* * If range_ns is 0, calling hrtimer_set_expires_range_ns() is * effectively the same as calling hrtimer_set_expires(). */ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); return timeout; } /* * Generate a machine wide unique identifier for this inode. * * This relies on u64 not wrapping in the life-time of the machine; which with * 1ns resolution means almost 585 years. * * This further relies on the fact that a well formed program will not unmap * the file while it has a (shared) futex waiting on it. This mapping will have * a file reference which pins the mount and inode. * * If for some reason an inode gets evicted and read back in again, it will get * a new sequence number and will _NOT_ match, even though it is the exact same * file. * * It is important that futex_match() will never have a false-positive, esp. * for PI futexes that can mess up the state. The above argues that false-negatives * are only possible for malformed programs. */ static u64 get_inode_sequence_number(struct inode *inode) { static atomic64_t i_seq; u64 old; /* Does the inode already have a sequence number? */ old = atomic64_read(&inode->i_sequence); if (likely(old)) return old; for (;;) { u64 new = atomic64_add_return(1, &i_seq); if (WARN_ON_ONCE(!new)) continue; old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); if (old) return old; return new; } } /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex * @flags: FLAGS_* * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) * * Return: a negative error code or 0 * * The key words are stored in @key on success. * * For shared mappings (when @fshared), the key is: * * ( inode->i_sequence, page->index, offset_within_page ) * * [ also see get_inode_sequence_number() ] * * For private mappings (or when !@fshared), the key is: * * ( current->mm, address, 0 ) * * This allows (cross process, where applicable) identification of the futex * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; struct page *page; struct folio *folio; struct address_space *mapping; int err, ro = 0; bool fshared; fshared = flags & FLAGS_SHARED; /* * The futex address must be "naturally" aligned. */ key->both.offset = address % PAGE_SIZE; if (unlikely((address % sizeof(u32)) != 0)) return -EINVAL; address -= key->both.offset; if (unlikely(!access_ok(uaddr, sizeof(u32)))) return -EFAULT; if (unlikely(should_fail_futex(fshared))) return -EFAULT; /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs * virtual address, we dont even have to find the underlying vma. * Note : We do have to check 'uaddr' is a valid user address, * but access_ok() should be faster than find_vma() */ if (!fshared) { /* * On no-MMU, shared futexes are treated as private, therefore * we must not include the current process in the key. Since * there is only one address space, the address is a unique key * on its own. */ if (IS_ENABLED(CONFIG_MMU)) key->private.mm = mm; else key->private.mm = NULL; key->private.address = address; return 0; } again: /* Ignore any VERIFY_READ mapping (futex common case) */ if (unlikely(should_fail_futex(true))) return -EFAULT; err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. */ if (err == -EFAULT && rw == FUTEX_READ) { err = get_user_pages_fast(address, 1, 0, &page); ro = 1; } if (err < 0) return err; else err = 0; /* * The treatment of mapping from this point on is critical. The folio * lock protects many things but in this context the folio lock * stabilizes mapping, prevents inode freeing in the shared * file-backed region case and guards against movement to swap cache. * * Strictly speaking the folio lock is not needed in all cases being * considered here and folio lock forces unnecessarily serialization. * From this point on, mapping will be re-verified if necessary and * folio lock will be acquired only if it is unavoidable * * Mapping checks require the folio so it is looked up now. For * anonymous pages, it does not matter if the folio is split * in the future as the key is based on the address. For * filesystem-backed pages, the precise page is required as the * index of the page determines the key. */ folio = page_folio(page); mapping = READ_ONCE(folio->mapping); /* * If folio->mapping is NULL, then it cannot be an anonymous * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast * found it, but truncated or holepunched or subjected to * invalidate_complete_page2 before we got the folio lock (also * cases which we are happy to fail). And we hold a reference, * so refcount care in invalidate_inode_page's remove_mapping * prevents drop_caches from setting mapping to NULL beneath us. * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: * an unlikely race, but we do need to retry for folio->mapping. */ if (unlikely(!mapping)) { int shmem_swizzled; /* * Folio lock is required to identify which special case above * applies. If this is really a shmem page then the folio lock * will prevent unexpected transitions. */ folio_lock(folio); shmem_swizzled = folio_test_swapcache(folio) || folio->mapping; folio_unlock(folio); folio_put(folio); if (shmem_swizzled) goto again; return -EFAULT; } /* * Private mappings are handled in a simple way. * * If the futex key is stored in anonymous memory, then the associated * object is the mm which is implicitly pinned by the calling process. * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ if (folio_test_anon(folio)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ if (unlikely(should_fail_futex(true)) || ro) { err = -EFAULT; goto out; } key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; } else { struct inode *inode; /* * The associated futex object in this case is the inode and * the folio->mapping must be traversed. Ordinarily this should * be stabilised under folio lock but it's not strictly * necessary in this case as we just want to pin the inode, not * update i_pages or anything like that. * * The RCU read lock is taken as the inode is finally freed * under RCU. If the mapping still matches expectations then the * mapping->host can be safely accessed as being a valid inode. */ rcu_read_lock(); if (READ_ONCE(folio->mapping) != mapping) { rcu_read_unlock(); folio_put(folio); goto again; } inode = READ_ONCE(mapping->host); if (!inode) { rcu_read_unlock(); folio_put(folio); goto again; } key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = folio->index + folio_page_idx(folio, page); rcu_read_unlock(); } out: folio_put(folio); return err; } /** * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address * * Slow path to fixup the fault we just took in the atomic write * access to @uaddr. * * We have no generic implementation of a non-destructive write to the * user address. We know that we faulted in the atomic pagefault * disabled section so we can as well avoid the #PF overhead by * calling get_user_pages() right away. */ int fault_in_user_writeable(u32 __user *uaddr) { struct mm_struct *mm = current->mm; int ret; mmap_read_lock(mm); ret = fixup_user_fault(mm, (unsigned long)uaddr, FAULT_FLAG_WRITE, NULL); mmap_read_unlock(mm); return ret < 0 ? ret : 0; } /** * futex_top_waiter() - Return the highest priority waiter on a futex * @hb: the hash bucket the futex_q's reside in * @key: the futex key (to distinguish it from other futex futex_q's) * * Must be called with the hb lock held. */ struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key) { struct futex_q *this; plist_for_each_entry(this, &hb->chain, list) { if (futex_match(&this->key, key)) return this; } return NULL; } int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) { int ret; pagefault_disable(); ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); pagefault_enable(); return ret; } int futex_get_value_locked(u32 *dest, u32 __user *from) { int ret; pagefault_disable(); ret = __get_user(*dest, from); pagefault_enable(); return ret ? -EFAULT : 0; } /** * wait_for_owner_exiting - Block until the owner has exited * @ret: owner's current futex lock status * @exiting: Pointer to the exiting task * * Caller must hold a refcount on @exiting. */ void wait_for_owner_exiting(int ret, struct task_struct *exiting) { if (ret != -EBUSY) { WARN_ON_ONCE(exiting); return; } if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) return; mutex_lock(&exiting->futex_exit_mutex); /* * No point in doing state checking here. If the waiter got here * while the task was in exec()->exec_futex_release() then it can * have any FUTEX_STATE_* value when the waiter has acquired the * mutex. OK, if running, EXITING or DEAD if it reached exit() * already. Highly unlikely and not a problem. Just one more round * through the futex maze. */ mutex_unlock(&exiting->futex_exit_mutex); put_task_struct(exiting); } /** * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket * @q: The futex_q to unqueue * * The q->lock_ptr must not be NULL and must be held by the caller. */ void __futex_unqueue(struct futex_q *q) { struct futex_hash_bucket *hb; if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) return; lockdep_assert_held(q->lock_ptr); hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); plist_del(&q->list, &hb->chain); futex_hb_waiters_dec(hb); } /* The key must be already stored in q->key. */ struct futex_hash_bucket *futex_q_lock(struct futex_q *q) __acquires(&hb->lock) { struct futex_hash_bucket *hb; hb = futex_hash(&q->key); /* * Increment the counter before taking the lock so that * a potential waker won't miss a to-be-slept task that is * waiting for the spinlock. This is safe as all futex_q_lock() * users end up calling futex_queue(). Similarly, for housekeeping, * decrement the counter at futex_q_unlock() when some error has * occurred and we don't end up adding the task to the list. */ futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */ q->lock_ptr = &hb->lock; spin_lock(&hb->lock); return hb; } void futex_q_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock) { spin_unlock(&hb->lock); futex_hb_waiters_dec(hb); } void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; /* * The priority used to register this element is * - either the real thread-priority for the real-time threads * (i.e. threads with a priority lower than MAX_RT_PRIO) * - or MAX_RT_PRIO for non-RT threads. * Thus, all RT-threads are woken first in priority order, and * the others are woken last, in FIFO order. */ prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); plist_add(&q->list, &hb->chain); q->task = current; } /** * futex_unqueue() - Remove the futex_q from its futex_hash_bucket * @q: The futex_q to unqueue * * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must * be paired with exactly one earlier call to futex_queue(). * * Return: * - 1 - if the futex_q was still queued (and we removed unqueued it); * - 0 - if the futex_q was already removed by the waking thread */ int futex_unqueue(struct futex_q *q) { spinlock_t *lock_ptr; int ret = 0; /* In the common case we don't take the spinlock, which is nice. */ retry: /* * q->lock_ptr can change between this read and the following spin_lock. * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and * optimizing lock_ptr out of the logic below. */ lock_ptr = READ_ONCE(q->lock_ptr); if (lock_ptr != NULL) { spin_lock(lock_ptr); /* * q->lock_ptr can change between reading it and * spin_lock(), causing us to take the wrong lock. This * corrects the race condition. * * Reasoning goes like this: if we have the wrong lock, * q->lock_ptr must have changed (maybe several times) * between reading it and the spin_lock(). It can * change again after the spin_lock() but only if it was * already changed before the spin_lock(). It cannot, * however, change back to the original value. Therefore * we can detect whether we acquired the correct lock. */ if (unlikely(lock_ptr != q->lock_ptr)) { spin_unlock(lock_ptr); goto retry; } __futex_unqueue(q); BUG_ON(q->pi_state); spin_unlock(lock_ptr); ret = 1; } return ret; } /* * PI futexes can not be requeued and must remove themselves from the hash * bucket. The hash bucket lock (i.e. lock_ptr) is held. */ void futex_unqueue_pi(struct futex_q *q) { /* * If the lock was not acquired (due to timeout or signal) then the * rt_waiter is removed before futex_q is. If this is observed by * an unlocker after dropping the rtmutex wait lock and before * acquiring the hash bucket lock, then the unlocker dequeues the * futex_q from the hash bucket list to guarantee consistent state * vs. userspace. Therefore the dequeue here must be conditional. */ if (!plist_node_empty(&q->list)) __futex_unqueue(q); BUG_ON(!q->pi_state); put_pi_state(q->pi_state); q->pi_state = NULL; } /* Constants for the pending_op argument of handle_futex_death */ #define HANDLE_DEATH_PENDING true #define HANDLE_DEATH_LIST false /* * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, bool pi, bool pending_op) { u32 uval, nval, mval; pid_t owner; int err; /* Futex address must be 32bit aligned */ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) return -1; retry: if (get_user(uval, uaddr)) return -1; /* * Special case for regular (non PI) futexes. The unlock path in * user space has two race scenarios: * * 1. The unlock path releases the user space futex value and * before it can execute the futex() syscall to wake up * waiters it is killed. * * 2. A woken up waiter is killed before it can acquire the * futex in user space. * * In the second case, the wake up notification could be generated * by the unlock path in user space after setting the futex value * to zero or by the kernel after setting the OWNER_DIED bit below. * * In both cases the TID validation below prevents a wakeup of * potential waiters which can cause these waiters to block * forever. * * In both cases the following conditions are met: * * 1) task->robust_list->list_op_pending != NULL * @pending_op == true * 2) The owner part of user space futex value == 0 * 3) Regular futex: @pi == false * * If these conditions are met, it is safe to attempt waking up a * potential waiter without touching the user space futex value and * trying to set the OWNER_DIED bit. If the futex value is zero, * the rest of the user space mutex state is consistent, so a woken * waiter will just take over the uncontended futex. Setting the * OWNER_DIED bit would create inconsistent state and malfunction * of the user space owner died handling. Otherwise, the OWNER_DIED * bit is already set, and the woken waiter is expected to deal with * this. */ owner = uval & FUTEX_TID_MASK; if (pending_op && !pi && !owner) { futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, FUTEX_BITSET_MATCH_ANY); return 0; } if (owner != task_pid_vnr(curr)) return 0; /* * Ok, this dying thread is truly holding a futex * of interest. Set the OWNER_DIED bit atomically * via cmpxchg, and if the value had FUTEX_WAITERS * set, wake up a waiter (if any). (We have to do a * futex_wake() even if OWNER_DIED is already set - * to handle the rare but possible case of recursive * thread-death.) The rest of the cleanup is done in * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; /* * We are not holding a lock here, but we want to have * the pagefault_disable/enable() protection because * we want to handle the fault gracefully. If the * access fails we try to fault in the futex with R/W * verification via get_user_pages. get_user() above * does not guarantee R/W access. If that fails we * give up and leave the futex locked. */ if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) { switch (err) { case -EFAULT: if (fault_in_user_writeable(uaddr)) return -1; goto retry; case -EAGAIN: cond_resched(); goto retry; default: WARN_ON_ONCE(1); return err; } } if (nval != uval) goto retry; /* * Wake robust non-PI futexes here. The wakeup of * PI futexes happens in exit_pi_state(): */ if (!pi && (uval & FUTEX_WAITERS)) { futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, FUTEX_BITSET_MATCH_ANY); } return 0; } /* * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int fetch_robust_entry(struct robust_list __user **entry, struct robust_list __user * __user *head, unsigned int *pi) { unsigned long uentry; if (get_user(uentry, (unsigned long __user *)head)) return -EFAULT; *entry = (void __user *)(uentry & ~1UL); *pi = uentry & 1; return 0; } /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. */ static void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned int next_pi; unsigned long futex_offset; int rc; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ if (fetch_robust_entry(&entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: */ if (get_user(futex_offset, &head->futex_offset)) return; /* * Fetch any possibly pending lock-add first, and handle it * if it exists: */ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; next_entry = NULL; /* avoid warning with gcc */ while (entry != &head->list) { /* * Fetch the next entry in the list before calling * handle_futex_death: */ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); /* * A pending lock might already be on the list, so * don't process it twice: */ if (entry != pending) { if (handle_futex_death((void __user *)entry + futex_offset, curr, pi, HANDLE_DEATH_LIST)) return; } if (rc) return; entry = next_entry; pi = next_pi; /* * Avoid excessively long or circular lists: */ if (!--limit) break; cond_resched(); } if (pending) { handle_futex_death((void __user *)pending + futex_offset, curr, pip, HANDLE_DEATH_PENDING); } } #ifdef CONFIG_COMPAT static void __user *futex_uaddr(struct robust_list __user *entry, compat_long_t futex_offset) { compat_uptr_t base = ptr_to_compat(entry); void __user *uaddr = compat_ptr(base + futex_offset); return uaddr; } /* * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, compat_uptr_t __user *head, unsigned int *pi) { if (get_user(*uentry, head)) return -EFAULT; *entry = compat_ptr((*uentry) & ~1); *pi = (unsigned int)(*uentry) & 1; return 0; } /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. */ static void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned int next_pi; compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; int rc; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: */ if (get_user(futex_offset, &head->futex_offset)) return; /* * Fetch any possibly pending lock-add first, and handle it * if it exists: */ if (compat_fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pip)) return; next_entry = NULL; /* avoid warning with gcc */ while (entry != (struct robust_list __user *) &head->list) { /* * Fetch the next entry in the list before calling * handle_futex_death: */ rc = compat_fetch_robust_entry(&next_uentry, &next_entry, (compat_uptr_t __user *)&entry->next, &next_pi); /* * A pending lock might already be on the list, so * dont process it twice: */ if (entry != pending) { void __user *uaddr = futex_uaddr(entry, futex_offset); if (handle_futex_death(uaddr, curr, pi, HANDLE_DEATH_LIST)) return; } if (rc) return; uentry = next_uentry; entry = next_entry; pi = next_pi; /* * Avoid excessively long or circular lists: */ if (!--limit) break; cond_resched(); } if (pending) { void __user *uaddr = futex_uaddr(pending, futex_offset); handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); } } #endif #ifdef CONFIG_FUTEX_PI /* * This task is holding PI mutexes at exit time => bad. * Kernel cleans up PI-state, but userspace is likely hosed. * (Robust-futex cleanup is separate and might save the day for userspace.) */ static void exit_pi_state_list(struct task_struct *curr) { struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; struct futex_hash_bucket *hb; union futex_key key = FUTEX_KEY_INIT; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful * versus waiters unqueueing themselves: */ raw_spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; hb = futex_hash(&key); /* * We can race against put_pi_state() removing itself from the * list (a waiter going away). put_pi_state() will first * decrement the reference count and then modify the list, so * its possible to see the list entry but fail this reference * acquire. * * In that case; drop the locks to let put_pi_state() make * progress and retry the loop. */ if (!refcount_inc_not_zero(&pi_state->refcount)) { raw_spin_unlock_irq(&curr->pi_lock); cpu_relax(); raw_spin_lock_irq(&curr->pi_lock); continue; } raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_lock(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: */ if (head->next != next) { /* retain curr->pi_lock for the loop invariant */ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); put_pi_state(pi_state); continue; } WARN_ON(pi_state->owner != curr); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; raw_spin_unlock(&curr->pi_lock); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); rt_mutex_futex_unlock(&pi_state->pi_mutex); put_pi_state(pi_state); raw_spin_lock_irq(&curr->pi_lock); } raw_spin_unlock_irq(&curr->pi_lock); } #else static inline void exit_pi_state_list(struct task_struct *curr) { } #endif static void futex_cleanup(struct task_struct *tsk) { if (unlikely(tsk->robust_list)) { exit_robust_list(tsk); tsk->robust_list = NULL; } #ifdef CONFIG_COMPAT if (unlikely(tsk->compat_robust_list)) { compat_exit_robust_list(tsk); tsk->compat_robust_list = NULL; } #endif if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); } /** * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD * @tsk: task to set the state on * * Set the futex exit state of the task lockless. The futex waiter code * observes that state when a task is exiting and loops until the task has * actually finished the futex cleanup. The worst case for this is that the * waiter runs through the wait loop until the state becomes visible. * * This is called from the recursive fault handling path in make_task_dead(). * * This is best effort. Either the futex exit code has run already or * not. If the OWNER_DIED bit has been set on the futex then the waiter can * take it over. If not, the problem is pushed back to user space. If the * futex exit code did not run yet, then an already queued waiter might * block forever, but there is nothing which can be done about that. */ void futex_exit_recursive(struct task_struct *tsk) { /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ if (tsk->futex_state == FUTEX_STATE_EXITING) mutex_unlock(&tsk->futex_exit_mutex); tsk->futex_state = FUTEX_STATE_DEAD; } static void futex_cleanup_begin(struct task_struct *tsk) { /* * Prevent various race issues against a concurrent incoming waiter * including live locks by forcing the waiter to block on * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in * attach_to_pi_owner(). */ mutex_lock(&tsk->futex_exit_mutex); /* * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. * * This ensures that all subsequent checks of tsk->futex_state in * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with * tsk->pi_lock held. * * It guarantees also that a pi_state which was queued right before * the state change under tsk->pi_lock by a concurrent waiter must * be observed in exit_pi_state_list(). */ raw_spin_lock_irq(&tsk->pi_lock); tsk->futex_state = FUTEX_STATE_EXITING; raw_spin_unlock_irq(&tsk->pi_lock); } static void futex_cleanup_end(struct task_struct *tsk, int state) { /* * Lockless store. The only side effect is that an observer might * take another loop until it becomes visible. */ tsk->futex_state = state; /* * Drop the exit protection. This unblocks waiters which observed * FUTEX_STATE_EXITING to reevaluate the state. */ mutex_unlock(&tsk->futex_exit_mutex); } void futex_exec_release(struct task_struct *tsk) { /* * The state handling is done for consistency, but in the case of * exec() there is no way to prevent further damage as the PID stays * the same. But for the unlikely and arguably buggy case that a * futex is held on exec(), this provides at least as much state * consistency protection which is possible. */ futex_cleanup_begin(tsk); futex_cleanup(tsk); /* * Reset the state to FUTEX_STATE_OK. The task is alive and about * exec a new binary. */ futex_cleanup_end(tsk, FUTEX_STATE_OK); } void futex_exit_release(struct task_struct *tsk) { futex_cleanup_begin(tsk); futex_cleanup(tsk); futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } static int __init futex_init(void) { unsigned int futex_shift; unsigned long i; #if CONFIG_BASE_SMALL futex_hashsize = 16; #else futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); #endif futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), futex_hashsize, 0, 0, &futex_shift, NULL, futex_hashsize, futex_hashsize); futex_hashsize = 1UL << futex_shift; for (i = 0; i < futex_hashsize; i++) { atomic_set(&futex_queues[i].waiters, 0); plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } return 0; } core_initcall(futex_init); |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 | // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "clock.h" #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/preempt.h> static inline long io_timer_cmp(io_timer_heap *h, struct io_timer *l, struct io_timer *r) { return l->expire - r->expire; } void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) { size_t i; spin_lock(&clock->timer_lock); if (time_after_eq((unsigned long) atomic64_read(&clock->now), timer->expire)) { spin_unlock(&clock->timer_lock); timer->fn(timer); return; } for (i = 0; i < clock->timers.used; i++) if (clock->timers.data[i] == timer) goto out; BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); out: spin_unlock(&clock->timer_lock); } void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) { size_t i; spin_lock(&clock->timer_lock); for (i = 0; i < clock->timers.used; i++) if (clock->timers.data[i] == timer) { heap_del(&clock->timers, i, io_timer_cmp, NULL); break; } spin_unlock(&clock->timer_lock); } struct io_clock_wait { struct io_timer io_timer; struct timer_list cpu_timer; struct task_struct *task; int expired; }; static void io_clock_wait_fn(struct io_timer *timer) { struct io_clock_wait *wait = container_of(timer, struct io_clock_wait, io_timer); wait->expired = 1; wake_up_process(wait->task); } static void io_clock_cpu_timeout(struct timer_list *timer) { struct io_clock_wait *wait = container_of(timer, struct io_clock_wait, cpu_timer); wait->expired = 1; wake_up_process(wait->task); } void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) { struct io_clock_wait wait; /* XXX: calculate sleep time rigorously */ wait.io_timer.expire = until; wait.io_timer.fn = io_clock_wait_fn; wait.task = current; wait.expired = 0; bch2_io_timer_add(clock, &wait.io_timer); schedule(); bch2_io_timer_del(clock, &wait.io_timer); } void bch2_kthread_io_clock_wait(struct io_clock *clock, unsigned long io_until, unsigned long cpu_timeout) { bool kthread = (current->flags & PF_KTHREAD) != 0; struct io_clock_wait wait; wait.io_timer.expire = io_until; wait.io_timer.fn = io_clock_wait_fn; wait.task = current; wait.expired = 0; bch2_io_timer_add(clock, &wait.io_timer); timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); do { set_current_state(TASK_INTERRUPTIBLE); if (kthread && kthread_should_stop()) break; if (wait.expired) break; schedule(); try_to_freeze(); } while (0); __set_current_state(TASK_RUNNING); del_timer_sync(&wait.cpu_timer); destroy_timer_on_stack(&wait.cpu_timer); bch2_io_timer_del(clock, &wait.io_timer); } static struct io_timer *get_expired_timer(struct io_clock *clock, unsigned long now) { struct io_timer *ret = NULL; spin_lock(&clock->timer_lock); if (clock->timers.used && time_after_eq(now, clock->timers.data[0]->expire)) heap_pop(&clock->timers, ret, io_timer_cmp, NULL); spin_unlock(&clock->timer_lock); return ret; } void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) { struct io_timer *timer; unsigned long now = atomic64_add_return(sectors, &clock->now); while ((timer = get_expired_timer(clock, now))) timer->fn(timer); } void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) { unsigned long now; unsigned i; out->atomic++; spin_lock(&clock->timer_lock); now = atomic64_read(&clock->now); for (i = 0; i < clock->timers.used; i++) prt_printf(out, "%ps:\t%li\n", clock->timers.data[i]->fn, clock->timers.data[i]->expire - now); spin_unlock(&clock->timer_lock); --out->atomic; } void bch2_io_clock_exit(struct io_clock *clock) { free_heap(&clock->timers); free_percpu(clock->pcpu_buf); } int bch2_io_clock_init(struct io_clock *clock) { atomic64_set(&clock->now, 0); spin_lock_init(&clock->timer_lock); clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); if (!clock->pcpu_buf) return -BCH_ERR_ENOMEM_io_clock_init; if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) return -BCH_ERR_ENOMEM_io_clock_init; return 0; } |
9 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __KVM_X86_SGX_H #define __KVM_X86_SGX_H #include <linux/kvm_host.h> #include "capabilities.h" #include "vmx_ops.h" #ifdef CONFIG_X86_SGX_KVM extern bool __read_mostly enable_sgx; int handle_encls(struct kvm_vcpu *vcpu); void setup_default_sgx_lepubkeyhash(void); void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu); void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); #else #define enable_sgx 0 static inline void setup_default_sgx_lepubkeyhash(void) { } static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { } static inline void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { /* Nothing to do if hardware doesn't support SGX */ if (cpu_has_vmx_encls_vmexit()) vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); } #endif #endif /* __KVM_X86_SGX_H */ |
6 6 6 6 6 6 6 6 6 6 6 6 6 6 80 80 3 3 6 6 6 3 3 6 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 | /* * Rusty Russell (C)2000 -- This code is GPL. * Patrick McHardy (c) 2006-2012 */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/skbuff.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> #include <linux/seq_file.h> #include <linux/rcupdate.h> #include <net/protocol.h> #include <net/netfilter/nf_queue.h> #include <net/dst.h> #include "nf_internals.h" static const struct nf_queue_handler __rcu *nf_queue_handler; /* * Hook for nfnetlink_queue to register its queue handler. * We do this so that most of the NFQUEUE code can be modular. * * Once the queue is registered it must reinject all packets it * receives, no matter what. */ void nf_register_queue_handler(const struct nf_queue_handler *qh) { /* should never happen, we only have one queueing backend in kernel */ WARN_ON(rcu_access_pointer(nf_queue_handler)); rcu_assign_pointer(nf_queue_handler, qh); } EXPORT_SYMBOL(nf_register_queue_handler); /* The caller must flush their queue before this */ void nf_unregister_queue_handler(void) { RCU_INIT_POINTER(nf_queue_handler, NULL); } EXPORT_SYMBOL(nf_unregister_queue_handler); static void nf_queue_sock_put(struct sock *sk) { #ifdef CONFIG_INET sock_gen_put(sk); #else sock_put(sk); #endif } static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; /* Release those devices we held, or Alexey will kill me. */ dev_put(state->in); dev_put(state->out); if (state->sk) nf_queue_sock_put(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) dev_put(entry->physin); dev_put(entry->physout); #endif } void nf_queue_entry_free(struct nf_queue_entry *entry) { nf_queue_entry_release_refs(entry); kfree(entry); } EXPORT_SYMBOL_GPL(nf_queue_entry_free); static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) const struct sk_buff *skb = entry->skb; if (nf_bridge_info_exists(skb)) { entry->physin = nf_bridge_get_physindev(skb, entry->state.net); entry->physout = nf_bridge_get_physoutdev(skb); } else { entry->physin = NULL; entry->physout = NULL; } #endif } /* Bump dev refs so they don't vanish while packet is out */ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt)) return false; dev_hold(state->in); dev_hold(state->out); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) dev_hold(entry->physin); dev_hold(entry->physout); #endif return true; } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); void nf_queue_nf_hook_drop(struct net *net) { const struct nf_queue_handler *qh; rcu_read_lock(); qh = rcu_dereference(nf_queue_handler); if (qh) qh->nf_hook_drop(net); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_queue_nf_hook_drop); static void nf_ip_saveroute(const struct sk_buff *skb, struct nf_queue_entry *entry) { struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); if (entry->state.hook == NF_INET_LOCAL_OUT) { const struct iphdr *iph = ip_hdr(skb); rt_info->tos = iph->tos; rt_info->daddr = iph->daddr; rt_info->saddr = iph->saddr; rt_info->mark = skb->mark; } } static void nf_ip6_saveroute(const struct sk_buff *skb, struct nf_queue_entry *entry) { struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); if (entry->state.hook == NF_INET_LOCAL_OUT) { const struct ipv6hdr *iph = ipv6_hdr(skb); rt_info->daddr = iph->daddr; rt_info->saddr = iph->saddr; rt_info->mark = skb->mark; } } static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, unsigned int index, unsigned int queuenum) { struct nf_queue_entry *entry = NULL; const struct nf_queue_handler *qh; unsigned int route_key_size; int status; /* QUEUE == DROP if no one is waiting, to be safe. */ qh = rcu_dereference(nf_queue_handler); if (!qh) return -ESRCH; switch (state->pf) { case AF_INET: route_key_size = sizeof(struct ip_rt_info); break; case AF_INET6: route_key_size = sizeof(struct ip6_rt_info); break; default: route_key_size = 0; break; } if (skb_sk_is_prefetched(skb)) { struct sock *sk = skb->sk; if (!sk_is_refcounted(sk)) { if (!refcount_inc_not_zero(&sk->sk_refcnt)) return -ENOTCONN; /* drop refcount on skb_orphan */ skb->destructor = sock_edemux; } } entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC); if (!entry) return -ENOMEM; if (skb_dst(skb) && !skb_dst_force(skb)) { kfree(entry); return -ENETDOWN; } *entry = (struct nf_queue_entry) { .skb = skb, .state = *state, .hook_index = index, .size = sizeof(*entry) + route_key_size, }; __nf_queue_entry_init_physdevs(entry); if (!nf_queue_entry_get_refs(entry)) { kfree(entry); return -ENOTCONN; } switch (entry->state.pf) { case AF_INET: nf_ip_saveroute(skb, entry); break; case AF_INET6: nf_ip6_saveroute(skb, entry); break; } status = qh->outfn(entry, queuenum); if (status < 0) { nf_queue_entry_free(entry); return status; } return 0; } /* Packets leaving via this function must come back through nf_reinject(). */ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, unsigned int index, unsigned int verdict) { int ret; ret = __nf_queue(skb, state, index, verdict >> NF_VERDICT_QBITS); if (ret < 0) { if (ret == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) return 1; kfree_skb(skb); } return 0; } EXPORT_SYMBOL_GPL(nf_queue); |
6 2 2 2 3 6 15 8 16 1 15 14 4 9 1 1 5 7 3 7 1 1 5 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 | // SPDX-License-Identifier: GPL-2.0-only /* * ebt_among * * Authors: * Grzegorz Borowiak <grzes@gnu.univ.gda.pl> * * August, 2003 * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ip.h> #include <linux/if_arp.h> #include <linux/module.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_among.h> static bool ebt_mac_wormhash_contains(const struct ebt_mac_wormhash *wh, const char *mac, __be32 ip) { /* You may be puzzled as to how this code works. * Some tricks were used, refer to * include/linux/netfilter_bridge/ebt_among.h * as there you can find a solution of this mystery. */ const struct ebt_mac_wormhash_tuple *p; int start, limit, i; uint32_t cmp[2] = { 0, 0 }; int key = ((const unsigned char *)mac)[5]; ether_addr_copy(((char *) cmp) + 2, mac); start = wh->table[key]; limit = wh->table[key + 1]; if (ip) { for (i = start; i < limit; i++) { p = &wh->pool[i]; if (cmp[1] == p->cmp[1] && cmp[0] == p->cmp[0]) if (p->ip == 0 || p->ip == ip) return true; } } else { for (i = start; i < limit; i++) { p = &wh->pool[i]; if (cmp[1] == p->cmp[1] && cmp[0] == p->cmp[0]) if (p->ip == 0) return true; } } return false; } static int ebt_mac_wormhash_check_integrity(const struct ebt_mac_wormhash *wh) { int i; for (i = 0; i < 256; i++) { if (wh->table[i] > wh->table[i + 1]) return -0x100 - i; if (wh->table[i] < 0) return -0x200 - i; if (wh->table[i] > wh->poolsize) return -0x300 - i; } if (wh->table[256] > wh->poolsize) return -0xc00; return 0; } static int get_ip_dst(const struct sk_buff *skb, __be32 *addr) { if (eth_hdr(skb)->h_proto == htons(ETH_P_IP)) { const struct iphdr *ih; struct iphdr _iph; ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); if (ih == NULL) return -1; *addr = ih->daddr; } else if (eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) { const struct arphdr *ah; struct arphdr _arph; const __be32 *bp; __be32 buf; ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); if (ah == NULL || ah->ar_pln != sizeof(__be32) || ah->ar_hln != ETH_ALEN) return -1; bp = skb_header_pointer(skb, sizeof(struct arphdr) + 2 * ETH_ALEN + sizeof(__be32), sizeof(__be32), &buf); if (bp == NULL) return -1; *addr = *bp; } return 0; } static int get_ip_src(const struct sk_buff *skb, __be32 *addr) { if (eth_hdr(skb)->h_proto == htons(ETH_P_IP)) { const struct iphdr *ih; struct iphdr _iph; ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); if (ih == NULL) return -1; *addr = ih->saddr; } else if (eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) { const struct arphdr *ah; struct arphdr _arph; const __be32 *bp; __be32 buf; ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); if (ah == NULL || ah->ar_pln != sizeof(__be32) || ah->ar_hln != ETH_ALEN) return -1; bp = skb_header_pointer(skb, sizeof(struct arphdr) + ETH_ALEN, sizeof(__be32), &buf); if (bp == NULL) return -1; *addr = *bp; } return 0; } static bool ebt_among_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct ebt_among_info *info = par->matchinfo; const char *dmac, *smac; const struct ebt_mac_wormhash *wh_dst, *wh_src; __be32 dip = 0, sip = 0; wh_dst = ebt_among_wh_dst(info); wh_src = ebt_among_wh_src(info); if (wh_src) { smac = eth_hdr(skb)->h_source; if (get_ip_src(skb, &sip)) return false; if (!(info->bitmask & EBT_AMONG_SRC_NEG)) { /* we match only if it contains */ if (!ebt_mac_wormhash_contains(wh_src, smac, sip)) return false; } else { /* we match only if it DOES NOT contain */ if (ebt_mac_wormhash_contains(wh_src, smac, sip)) return false; } } if (wh_dst) { dmac = eth_hdr(skb)->h_dest; if (get_ip_dst(skb, &dip)) return false; if (!(info->bitmask & EBT_AMONG_DST_NEG)) { /* we match only if it contains */ if (!ebt_mac_wormhash_contains(wh_dst, dmac, dip)) return false; } else { /* we match only if it DOES NOT contain */ if (ebt_mac_wormhash_contains(wh_dst, dmac, dip)) return false; } } return true; } static bool poolsize_invalid(const struct ebt_mac_wormhash *w) { return w && w->poolsize >= (INT_MAX / sizeof(struct ebt_mac_wormhash_tuple)); } static bool wormhash_offset_invalid(int off, unsigned int len) { if (off == 0) /* not present */ return false; if (off < (int)sizeof(struct ebt_among_info) || off % __alignof__(struct ebt_mac_wormhash)) return true; off += sizeof(struct ebt_mac_wormhash); return off > len; } static bool wormhash_sizes_valid(const struct ebt_mac_wormhash *wh, int a, int b) { if (a == 0) a = sizeof(struct ebt_among_info); return ebt_mac_wormhash_size(wh) + a == b; } static int ebt_among_mt_check(const struct xt_mtchk_param *par) { const struct ebt_among_info *info = par->matchinfo; const struct ebt_entry_match *em = container_of(par->matchinfo, const struct ebt_entry_match, data); unsigned int expected_length = sizeof(struct ebt_among_info); const struct ebt_mac_wormhash *wh_dst, *wh_src; int err; if (expected_length > em->match_size) return -EINVAL; if (wormhash_offset_invalid(info->wh_dst_ofs, em->match_size) || wormhash_offset_invalid(info->wh_src_ofs, em->match_size)) return -EINVAL; wh_dst = ebt_among_wh_dst(info); if (poolsize_invalid(wh_dst)) return -EINVAL; expected_length += ebt_mac_wormhash_size(wh_dst); if (expected_length > em->match_size) return -EINVAL; wh_src = ebt_among_wh_src(info); if (poolsize_invalid(wh_src)) return -EINVAL; if (info->wh_src_ofs < info->wh_dst_ofs) { if (!wormhash_sizes_valid(wh_src, info->wh_src_ofs, info->wh_dst_ofs)) return -EINVAL; } else { if (!wormhash_sizes_valid(wh_dst, info->wh_dst_ofs, info->wh_src_ofs)) return -EINVAL; } expected_length += ebt_mac_wormhash_size(wh_src); if (em->match_size != EBT_ALIGN(expected_length)) { pr_err_ratelimited("wrong size: %d against expected %d, rounded to %zd\n", em->match_size, expected_length, EBT_ALIGN(expected_length)); return -EINVAL; } if (wh_dst && (err = ebt_mac_wormhash_check_integrity(wh_dst))) { pr_err_ratelimited("dst integrity fail: %x\n", -err); return -EINVAL; } if (wh_src && (err = ebt_mac_wormhash_check_integrity(wh_src))) { pr_err_ratelimited("src integrity fail: %x\n", -err); return -EINVAL; } return 0; } static struct xt_match ebt_among_mt_reg __read_mostly = { .name = "among", .revision = 0, .family = NFPROTO_BRIDGE, .match = ebt_among_mt, .checkentry = ebt_among_mt_check, .matchsize = -1, /* special case */ .me = THIS_MODULE, }; static int __init ebt_among_init(void) { return xt_register_match(&ebt_among_mt_reg); } static void __exit ebt_among_fini(void) { xt_unregister_match(&ebt_among_mt_reg); } module_init(ebt_among_init); module_exit(ebt_among_fini); MODULE_DESCRIPTION("Ebtables: Combined MAC/IP address list matching"); MODULE_LICENSE("GPL"); |
317 315 7 312 12 44 132 4 2 174 174 78 78 78 1 1 1 78 662 648 64 42 8 2 4 2 42 636 283 284 276 284 284 283 356 253 133 13 10 13 13 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 | /* * linux/fs/nls/nls_base.c * * Native language support--charsets and unicode translations. * By Gordon Chaffee 1996, 1997 * * Unicode based case conversion 1999 by Wolfram Pienkoss * */ #include <linux/module.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/kmod.h> #include <linux/spinlock.h> #include <asm/byteorder.h> static struct nls_table default_table; static struct nls_table *tables = &default_table; static DEFINE_SPINLOCK(nls_lock); /* * Sample implementation from Unicode home page. * http://www.stonehand.com/unicode/standard/fss-utf.html */ struct utf8_table { int cmask; int cval; int shift; long lmask; long lval; }; static const struct utf8_table utf8_table[] = { {0x80, 0x00, 0*6, 0x7F, 0, /* 1 byte sequence */}, {0xE0, 0xC0, 1*6, 0x7FF, 0x80, /* 2 byte sequence */}, {0xF0, 0xE0, 2*6, 0xFFFF, 0x800, /* 3 byte sequence */}, {0xF8, 0xF0, 3*6, 0x1FFFFF, 0x10000, /* 4 byte sequence */}, {0xFC, 0xF8, 4*6, 0x3FFFFFF, 0x200000, /* 5 byte sequence */}, {0xFE, 0xFC, 5*6, 0x7FFFFFFF, 0x4000000, /* 6 byte sequence */}, {0, /* end of table */} }; #define UNICODE_MAX 0x0010ffff #define PLANE_SIZE 0x00010000 #define SURROGATE_MASK 0xfffff800 #define SURROGATE_PAIR 0x0000d800 #define SURROGATE_LOW 0x00000400 #define SURROGATE_BITS 0x000003ff int utf8_to_utf32(const u8 *s, int inlen, unicode_t *pu) { unsigned long l; int c0, c, nc; const struct utf8_table *t; nc = 0; c0 = *s; l = c0; for (t = utf8_table; t->cmask; t++) { nc++; if ((c0 & t->cmask) == t->cval) { l &= t->lmask; if (l < t->lval || l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR) return -1; *pu = (unicode_t) l; return nc; } if (inlen <= nc) return -1; s++; c = (*s ^ 0x80) & 0xFF; if (c & 0xC0) return -1; l = (l << 6) | c; } return -1; } EXPORT_SYMBOL(utf8_to_utf32); int utf32_to_utf8(unicode_t u, u8 *s, int maxout) { unsigned long l; int c, nc; const struct utf8_table *t; if (!s) return 0; l = u; if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR) return -1; nc = 0; for (t = utf8_table; t->cmask && maxout; t++, maxout--) { nc++; if (l <= t->lmask) { c = t->shift; *s = (u8) (t->cval | (l >> c)); while (c > 0) { c -= 6; s++; *s = (u8) (0x80 | ((l >> c) & 0x3F)); } return nc; } } return -1; } EXPORT_SYMBOL(utf32_to_utf8); static inline void put_utf16(wchar_t *s, unsigned c, enum utf16_endian endian) { switch (endian) { default: *s = (wchar_t) c; break; case UTF16_LITTLE_ENDIAN: *s = __cpu_to_le16(c); break; case UTF16_BIG_ENDIAN: *s = __cpu_to_be16(c); break; } } int utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian, wchar_t *pwcs, int maxout) { u16 *op; int size; unicode_t u; op = pwcs; while (inlen > 0 && maxout > 0 && *s) { if (*s & 0x80) { size = utf8_to_utf32(s, inlen, &u); if (size < 0) return -EINVAL; s += size; inlen -= size; if (u >= PLANE_SIZE) { if (maxout < 2) break; u -= PLANE_SIZE; put_utf16(op++, SURROGATE_PAIR | ((u >> 10) & SURROGATE_BITS), endian); put_utf16(op++, SURROGATE_PAIR | SURROGATE_LOW | (u & SURROGATE_BITS), endian); maxout -= 2; } else { put_utf16(op++, u, endian); maxout--; } } else { put_utf16(op++, *s++, endian); inlen--; maxout--; } } return op - pwcs; } EXPORT_SYMBOL(utf8s_to_utf16s); static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian) { switch (endian) { default: return c; case UTF16_LITTLE_ENDIAN: return __le16_to_cpu(c); case UTF16_BIG_ENDIAN: return __be16_to_cpu(c); } } int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian, u8 *s, int maxout) { u8 *op; int size; unsigned long u, v; op = s; while (inlen > 0 && maxout > 0) { u = get_utf16(*pwcs, endian); if (!u) break; pwcs++; inlen--; if (u > 0x7f) { if ((u & SURROGATE_MASK) == SURROGATE_PAIR) { if (u & SURROGATE_LOW) { /* Ignore character and move on */ continue; } if (inlen <= 0) break; v = get_utf16(*pwcs, endian); if ((v & SURROGATE_MASK) != SURROGATE_PAIR || !(v & SURROGATE_LOW)) { /* Ignore character and move on */ continue; } u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10) + (v & SURROGATE_BITS); pwcs++; inlen--; } size = utf32_to_utf8(u, op, maxout); if (size == -1) { /* Ignore character and move on */ } else { op += size; maxout -= size; } } else { *op++ = (u8) u; maxout--; } } return op - s; } EXPORT_SYMBOL(utf16s_to_utf8s); int __register_nls(struct nls_table *nls, struct module *owner) { struct nls_table ** tmp = &tables; if (nls->next) return -EBUSY; nls->owner = owner; spin_lock(&nls_lock); while (*tmp) { if (nls == *tmp) { spin_unlock(&nls_lock); return -EBUSY; } tmp = &(*tmp)->next; } nls->next = tables; tables = nls; spin_unlock(&nls_lock); return 0; } EXPORT_SYMBOL(__register_nls); int unregister_nls(struct nls_table * nls) { struct nls_table ** tmp = &tables; spin_lock(&nls_lock); while (*tmp) { if (nls == *tmp) { *tmp = nls->next; spin_unlock(&nls_lock); return 0; } tmp = &(*tmp)->next; } spin_unlock(&nls_lock); return -EINVAL; } static struct nls_table *find_nls(const char *charset) { struct nls_table *nls; spin_lock(&nls_lock); for (nls = tables; nls; nls = nls->next) { if (!strcmp(nls->charset, charset)) break; if (nls->alias && !strcmp(nls->alias, charset)) break; } if (nls && !try_module_get(nls->owner)) nls = NULL; spin_unlock(&nls_lock); return nls; } struct nls_table *load_nls(const char *charset) { return try_then_request_module(find_nls(charset), "nls_%s", charset); } void unload_nls(struct nls_table *nls) { if (nls) module_put(nls->owner); } static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* 0x90*/ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* 0xa0*/ 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* 0xb0*/ 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, /* 0xc0*/ 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* 0xd0*/ 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, /* 0xe0*/ 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* 0xf0*/ 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char *const page_uni2charset[256] = { page00 }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table default_table = { .charset = "default", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; /* Returns a simple default translation table */ struct nls_table *load_nls_default(void) { struct nls_table *default_nls; default_nls = load_nls(CONFIG_NLS_DEFAULT); if (default_nls != NULL) return default_nls; else return &default_table; } EXPORT_SYMBOL(unregister_nls); EXPORT_SYMBOL(unload_nls); EXPORT_SYMBOL(load_nls); EXPORT_SYMBOL(load_nls_default); MODULE_LICENSE("Dual BSD/GPL"); |
74 74 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | // SPDX-License-Identifier: GPL-2.0-or-later /* mpih-rshift.c - MPI helper functions * Copyright (C) 1994, 1996, 1998, 1999, * 2000, 2001 Free Software Foundation, Inc. * * This file is part of GNUPG * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. * The GNU MP Library itself is published under the LGPL; * however I decided to publish this code under the plain GPL. */ #include "mpi-internal.h" /* Shift U (pointed to by UP and USIZE limbs long) CNT bits to the right * and store the USIZE least significant limbs of the result at WP. * The bits shifted out to the right are returned. * * Argument constraints: * 1. 0 < CNT < BITS_PER_MP_LIMB * 2. If the result is to be written over the input, WP must be <= UP. */ mpi_limb_t mpihelp_rshift(mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize, unsigned cnt) { mpi_limb_t high_limb, low_limb; unsigned sh_1, sh_2; mpi_size_t i; mpi_limb_t retval; sh_1 = cnt; wp -= 1; sh_2 = BITS_PER_MPI_LIMB - sh_1; high_limb = up[0]; retval = high_limb << sh_2; low_limb = high_limb; for (i = 1; i < usize; i++) { high_limb = up[i]; wp[i] = (low_limb >> sh_1) | (high_limb << sh_2); low_limb = high_limb; } wp[i] = low_limb >> sh_1; return retval; } |
4 107 103 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_ACT_API_H #define __NET_ACT_API_H /* * Public action API for classifiers/qdiscs */ #include <linux/refcount.h> #include <net/flow_offload.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/net_namespace.h> #include <net/netns/generic.h> struct tcf_idrinfo { struct mutex lock; struct idr action_idr; struct net *net; }; struct tc_action_ops; struct tc_action { const struct tc_action_ops *ops; __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ struct tcf_idrinfo *idrinfo; u32 tcfa_index; refcount_t tcfa_refcnt; atomic_t tcfa_bindcnt; int tcfa_action; struct tcf_t tcfa_tm; struct gnet_stats_basic_sync tcfa_bstats; struct gnet_stats_basic_sync tcfa_bstats_hw; struct gnet_stats_queue tcfa_qstats; struct net_rate_estimator __rcu *tcfa_rate_est; spinlock_t tcfa_lock; struct gnet_stats_basic_sync __percpu *cpu_bstats; struct gnet_stats_basic_sync __percpu *cpu_bstats_hw; struct gnet_stats_queue __percpu *cpu_qstats; struct tc_cookie __rcu *user_cookie; struct tcf_chain __rcu *goto_chain; u32 tcfa_flags; u8 hw_stats; u8 used_hw_stats; bool used_hw_stats_valid; u32 in_hw_count; }; #define tcf_index common.tcfa_index #define tcf_refcnt common.tcfa_refcnt #define tcf_bindcnt common.tcfa_bindcnt #define tcf_action common.tcfa_action #define tcf_tm common.tcfa_tm #define tcf_bstats common.tcfa_bstats #define tcf_qstats common.tcfa_qstats #define tcf_rate_est common.tcfa_rate_est #define tcf_lock common.tcfa_lock #define TCA_ACT_HW_STATS_ANY (TCA_ACT_HW_STATS_IMMEDIATE | \ TCA_ACT_HW_STATS_DELAYED) /* Reserve 16 bits for user-space. See TCA_ACT_FLAGS_NO_PERCPU_STATS. */ #define TCA_ACT_FLAGS_USER_BITS 16 #define TCA_ACT_FLAGS_USER_MASK 0xffff #define TCA_ACT_FLAGS_POLICE (1U << TCA_ACT_FLAGS_USER_BITS) #define TCA_ACT_FLAGS_BIND (1U << (TCA_ACT_FLAGS_USER_BITS + 1)) #define TCA_ACT_FLAGS_REPLACE (1U << (TCA_ACT_FLAGS_USER_BITS + 2)) #define TCA_ACT_FLAGS_NO_RTNL (1U << (TCA_ACT_FLAGS_USER_BITS + 3)) #define TCA_ACT_FLAGS_AT_INGRESS (1U << (TCA_ACT_FLAGS_USER_BITS + 4)) /* Update lastuse only if needed, to avoid dirtying a cache line. * We use a temp variable to avoid fetching jiffies twice. */ static inline void tcf_lastuse_update(struct tcf_t *tm) { unsigned long now = jiffies; if (tm->lastuse != now) tm->lastuse = now; if (unlikely(!tm->firstuse)) tm->firstuse = now; } static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm) { dtm->install = jiffies_to_clock_t(jiffies - stm->install); dtm->lastuse = jiffies_to_clock_t(jiffies - stm->lastuse); dtm->firstuse = stm->firstuse ? jiffies_to_clock_t(jiffies - stm->firstuse) : 0; dtm->expires = jiffies_to_clock_t(stm->expires); } static inline enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats) { if (WARN_ON_ONCE(hw_stats > TCA_ACT_HW_STATS_ANY)) return FLOW_ACTION_HW_STATS_DONT_CARE; else if (!hw_stats) return FLOW_ACTION_HW_STATS_DISABLED; return hw_stats; } typedef void (*tc_action_priv_destructor)(void *priv); struct tc_action_ops { struct list_head head; char kind[IFNAMSIZ]; enum tca_id id; /* identifier should match kind */ unsigned int net_id; size_t size; struct module *owner; int (*act)(struct sk_buff *, const struct tc_action *, struct tcf_result *); /* called under RCU BH lock*/ int (*dump)(struct sk_buff *, struct tc_action *, int, int); void (*cleanup)(struct tc_action *); int (*lookup)(struct net *net, struct tc_action **a, u32 index); int (*init)(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack); int (*walk)(struct net *, struct sk_buff *, struct netlink_callback *, int, const struct tc_action_ops *, struct netlink_ext_ack *); void (*stats_update)(struct tc_action *, u64, u64, u64, u64, bool); size_t (*get_fill_size)(const struct tc_action *act); struct net_device *(*get_dev)(const struct tc_action *a, tc_action_priv_destructor *destructor); struct psample_group * (*get_psample_group)(const struct tc_action *a, tc_action_priv_destructor *destructor); int (*offload_act_setup)(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack); }; #ifdef CONFIG_NET_CLS_ACT #define ACT_P_BOUND 0 #define ACT_P_CREATED 1 #define ACT_P_DELETED 1 struct tc_action_net { struct tcf_idrinfo *idrinfo; const struct tc_action_ops *ops; }; static inline int tc_action_net_init(struct net *net, struct tc_action_net *tn, const struct tc_action_ops *ops) { int err = 0; tn->idrinfo = kmalloc(sizeof(*tn->idrinfo), GFP_KERNEL); if (!tn->idrinfo) return -ENOMEM; tn->ops = ops; tn->idrinfo->net = net; mutex_init(&tn->idrinfo->lock); idr_init(&tn->idrinfo->action_idr); return err; } void tcf_idrinfo_destroy(const struct tc_action_ops *ops, struct tcf_idrinfo *idrinfo); static inline void tc_action_net_exit(struct list_head *net_list, unsigned int id) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { struct tc_action_net *tn = net_generic(net, id); tcf_idrinfo_destroy(tn->ops, tn->idrinfo); kfree(tn->idrinfo); } rtnl_unlock(); } int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack); int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index); int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, bool cpustats, u32 flags); int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, u32 flags); void tcf_idr_insert_many(struct tc_action *actions[], int init_res[]); void tcf_idr_cleanup(struct tc_action_net *tn, u32 index); int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, struct tc_action **a, int bind); int tcf_idr_release(struct tc_action *a, bool bind); int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops); int tcf_unregister_action(struct tc_action_ops *a, struct pernet_operations *ops); #define NET_ACT_ALIAS_PREFIX "net-act-" #define MODULE_ALIAS_NET_ACT(kind) MODULE_ALIAS(NET_ACT_ALIAS_PREFIX kind) int tcf_action_destroy(struct tc_action *actions[], int bind); int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res); int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, struct tc_action *actions[], int init_res[], size_t *attr_size, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack); struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, u32 flags, struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, struct tc_action_ops *a_o, int *init_res, u32 flags, struct netlink_ext_ack *extack); int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref, bool terse); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); static inline void tcf_action_update_bstats(struct tc_action *a, struct sk_buff *skb) { if (likely(a->cpu_bstats)) { bstats_update(this_cpu_ptr(a->cpu_bstats), skb); return; } spin_lock(&a->tcfa_lock); bstats_update(&a->tcfa_bstats, skb); spin_unlock(&a->tcfa_lock); } static inline void tcf_action_inc_drop_qstats(struct tc_action *a) { if (likely(a->cpu_qstats)) { qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); return; } spin_lock(&a->tcfa_lock); qstats_drop_inc(&a->tcfa_qstats); spin_unlock(&a->tcfa_lock); } static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a) { if (likely(a->cpu_qstats)) { qstats_overlimit_inc(this_cpu_ptr(a->cpu_qstats)); return; } spin_lock(&a->tcfa_lock); qstats_overlimit_inc(&a->tcfa_qstats); spin_unlock(&a->tcfa_lock); } void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, u64 drops, bool hw); int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); int tcf_action_update_hw_stats(struct tc_action *action); int tcf_action_reoffload_cb(flow_indr_block_bind_cb_t *cb, void *cb_priv, bool add); int tcf_action_check_ctrlact(int action, struct tcf_proto *tp, struct tcf_chain **handle, struct netlink_ext_ack *newchain); struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action, struct tcf_chain *newchain); #ifdef CONFIG_INET DECLARE_STATIC_KEY_FALSE(tcf_frag_xmit_count); #endif int tcf_dev_queue_xmit(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb)); #else /* !CONFIG_NET_CLS_ACT */ static inline int tcf_action_reoffload_cb(flow_indr_block_bind_cb_t *cb, void *cb_priv, bool add) { return 0; } #endif /* CONFIG_NET_CLS_ACT */ static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { #ifdef CONFIG_NET_CLS_ACT if (!a->ops->stats_update) return; a->ops->stats_update(a, bytes, packets, drops, lastuse, hw); #endif } #endif |
4 4 4 4 4 4 4 4 2 2 28 2 26 26 26 26 1 1 1 1 1 11 1 10 10 7 4 4 10 10 11 11 4 4 11 2 4 1 1 1 1 4 4 1 4 4 4 4 1 4 1 4 4 4 4 4 4 4 11 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 10 10 8 10 10 10 10 8 9 11 11 11 11 7 11 11 9 9 11 11 10 7 4 11 7 7 11 10 11 3 1 2 4 4 10 4 4 10 4 4 4 4 4 4 4 4 4 1 1 1 1 1 5 8 5 11 11 11 11 11 11 11 11 11 5 7 7 5 4 8 8 8 8 5 3 8 6 5 3 4 4 4 5 4 4 25 25 1 1 1 1 3 3 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 9 4 4 4 4 4 4 4 4 4 4 16 16 1 16 4 4 10 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 7 3 10 10 7 3 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 18 10 10 10 10 10 10 10 9 10 3 7 10 10 10 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 | // SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/segment.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/sched/mm.h> #include <linux/prefetch.h> #include <linux/kthread.h> #include <linux/swap.h> #include <linux/timer.h> #include <linux/freezer.h> #include <linux/sched/signal.h> #include <linux/random.h> #include "f2fs.h" #include "segment.h" #include "node.h" #include "gc.h" #include "iostat.h" #include <trace/events/f2fs.h> #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *revoke_entry_slab; static unsigned long __reverse_ulong(unsigned char *str) { unsigned long tmp = 0; int shift = 24, idx = 0; #if BITS_PER_LONG == 64 shift = 56; #endif while (shift >= 0) { tmp |= (unsigned long)str[idx++] << shift; shift -= BITS_PER_BYTE; } return tmp; } /* * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since * MSB and LSB are reversed in a byte by f2fs_set_bit. */ static inline unsigned long __reverse_ffs(unsigned long word) { int num = 0; #if BITS_PER_LONG == 64 if ((word & 0xffffffff00000000UL) == 0) num += 32; else word >>= 32; #endif if ((word & 0xffff0000) == 0) num += 16; else word >>= 16; if ((word & 0xff00) == 0) num += 8; else word >>= 8; if ((word & 0xf0) == 0) num += 4; else word >>= 4; if ((word & 0xc) == 0) num += 2; else word >>= 2; if ((word & 0x2) == 0) num += 1; return num; } /* * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because * f2fs_set_bit makes MSB and LSB reversed in a byte. * @size must be integral times of unsigned long. * Example: * MSB <--> LSB * f2fs_set_bit(0, bitmap) => 1000 0000 * f2fs_set_bit(7, bitmap) => 0000 0001 */ static unsigned long __find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); unsigned long result = size; unsigned long tmp; if (offset >= size) return size; size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; while (1) { if (*p == 0) goto pass; tmp = __reverse_ulong((unsigned char *)p); tmp &= ~0UL >> offset; if (size < BITS_PER_LONG) tmp &= (~0UL << (BITS_PER_LONG - size)); if (tmp) goto found; pass: if (size <= BITS_PER_LONG) break; size -= BITS_PER_LONG; offset = 0; p++; } return result; found: return result - size + __reverse_ffs(tmp); } static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); unsigned long result = size; unsigned long tmp; if (offset >= size) return size; size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; while (1) { if (*p == ~0UL) goto pass; tmp = __reverse_ulong((unsigned char *)p); if (offset) tmp |= ~0UL << (BITS_PER_LONG - offset); if (size < BITS_PER_LONG) tmp |= ~0UL >> size; if (tmp != ~0UL) goto found; pass: if (size <= BITS_PER_LONG) break; size -= BITS_PER_LONG; offset = 0; p++; } return result; found: return result - size + __reverse_ffz(tmp); } bool f2fs_need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (f2fs_lfs_mode(sbi)) return false; if (sbi->gc_mode == GC_URGENT_HIGH) return true; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } void f2fs_abort_atomic_write(struct inode *inode, bool clean) { struct f2fs_inode_info *fi = F2FS_I(inode); if (!f2fs_is_atomic_file(inode)) return; if (clean) truncate_inode_pages_final(inode->i_mapping); release_atomic_write_cnt(inode); clear_inode_flag(inode, FI_ATOMIC_COMMITTED); clear_inode_flag(inode, FI_ATOMIC_REPLACE); clear_inode_flag(inode, FI_ATOMIC_FILE); stat_dec_atomic_inode(inode); F2FS_I(inode)->atomic_write_task = NULL; if (clean) { f2fs_i_size_write(inode, fi->original_i_size); fi->original_i_size = 0; } /* avoid stale dirty inode during eviction */ sync_inode_metadata(inode, 0); } static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, block_t new_addr, block_t *old_addr, bool recover) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; struct node_info ni; int err; retry: set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (err) { if (err == -ENOMEM) { f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry; } return err; } err = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (err) { f2fs_put_dnode(&dn); return err; } if (recover) { /* dn.data_blkaddr is always valid */ if (!__is_valid_data_blkaddr(new_addr)) { if (new_addr == NULL_ADDR) dec_valid_block_count(sbi, inode, 1); f2fs_invalidate_blocks(sbi, dn.data_blkaddr); f2fs_update_data_blkaddr(&dn, new_addr); } else { f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, ni.version, true, true); } } else { blkcnt_t count = 1; err = inc_valid_block_count(sbi, inode, &count, true); if (err) { f2fs_put_dnode(&dn); return err; } *old_addr = dn.data_blkaddr; f2fs_truncate_data_blocks_range(&dn, 1); dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count); f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, ni.version, true, false); } f2fs_put_dnode(&dn); trace_f2fs_replace_atomic_write_block(inode, F2FS_I(inode)->cow_inode, index, old_addr ? *old_addr : 0, new_addr, recover); return 0; } static void __complete_revoke_list(struct inode *inode, struct list_head *head, bool revoke) { struct revoke_entry *cur, *tmp; pgoff_t start_index = 0; bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); list_for_each_entry_safe(cur, tmp, head, list) { if (revoke) { __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); } else if (truncate) { f2fs_truncate_hole(inode, start_index, cur->index); start_index = cur->index + 1; } list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } if (!revoke && truncate) f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false); } static int __f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct inode *cow_inode = fi->cow_inode; struct revoke_entry *new; struct list_head revoke_list; block_t blkaddr; struct dnode_of_data dn; pgoff_t len = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); pgoff_t off = 0, blen, index; int ret = 0, i; INIT_LIST_HEAD(&revoke_list); while (len) { blen = min_t(pgoff_t, ADDRS_PER_BLOCK(cow_inode), len); set_new_dnode(&dn, cow_inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); if (ret && ret != -ENOENT) { goto out; } else if (ret == -ENOENT) { ret = 0; if (dn.max_level == 0) goto out; goto next; } blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, cow_inode), len); index = off; for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { blkaddr = f2fs_data_blkaddr(&dn); if (!__is_valid_data_blkaddr(blkaddr)) { continue; } else if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); ret = -EFSCORRUPTED; goto out; } new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS, true, NULL); ret = __replace_atomic_write_block(inode, index, blkaddr, &new->old_addr, false); if (ret) { f2fs_put_dnode(&dn); kmem_cache_free(revoke_entry_slab, new); goto out; } f2fs_update_data_blkaddr(&dn, NULL_ADDR); new->index = index; list_add_tail(&new->list, &revoke_list); } f2fs_put_dnode(&dn); next: off += blen; len -= blen; } out: if (ret) { sbi->revoked_atomic_block += fi->atomic_write_cnt; } else { sbi->committed_atomic_block += fi->atomic_write_cnt; set_inode_flag(inode, FI_ATOMIC_COMMITTED); } __complete_revoke_list(inode, &revoke_list, ret ? true : false); return ret; } int f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); int err; err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (err) return err; f2fs_down_write(&fi->i_gc_rwsem[WRITE]); f2fs_lock_op(sbi); err = __f2fs_commit_atomic_write(inode); f2fs_unlock_op(sbi); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); return err; } /* * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. */ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { if (f2fs_cp_error(sbi)) return; if (time_to_inject(sbi, FAULT_CHECKPOINT)) f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); /* balance_fs_bg is able to be pending */ if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi, false); if (!f2fs_is_checkpoint_ready(sbi)) return; /* * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. */ if (has_enough_free_secs(sbi, 0, 0)) return; if (test_opt(sbi, GC_MERGE) && sbi->gc_thread && sbi->gc_thread->f2fs_gc_task) { DEFINE_WAIT(wait); prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait, TASK_UNINTERRUPTIBLE); wake_up(&sbi->gc_thread->gc_wait_queue_head); io_schedule(); finish_wait(&sbi->gc_thread->fggc_wq, &wait); } else { struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, .init_gc_type = BG_GC, .no_bg_gc = true, .should_migrate_blocks = false, .err_gc_skipped = false, .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); stat_inc_gc_call_count(sbi, FOREGROUND); f2fs_gc(sbi, &gc_control); } } static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi) { int factor = f2fs_rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2; unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS); unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA); unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES); unsigned int meta = get_pages(sbi, F2FS_DIRTY_META); unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA); unsigned int threshold = SEGS_TO_BLKS(sbi, (factor * DEFAULT_DIRTY_THRESHOLD)); unsigned int global_threshold = threshold * 3 / 2; if (dents >= threshold || qdata >= threshold || nodes >= threshold || meta >= threshold || imeta >= threshold) return true; return dents + qdata + nodes + meta + imeta > global_threshold; } void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) { if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return; /* try to shrink extent cache when there is no enough memory */ if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) f2fs_shrink_read_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); /* try to shrink age extent cache when there is no enough memory */ if (!f2fs_available_free_memory(sbi, AGE_EXTENT_CACHE)) f2fs_shrink_age_extent_tree(sbi, AGE_EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); if (!f2fs_available_free_memory(sbi, FREE_NIDS)) f2fs_try_to_free_nids(sbi, MAX_FREE_NIDS); else f2fs_build_free_nids(sbi, false, false); if (excess_dirty_nats(sbi) || excess_dirty_threshold(sbi) || excess_prefree_segs(sbi) || !f2fs_space_for_roll_forward(sbi)) goto do_sync; /* there is background inflight IO or foreground operation recently */ if (is_inflight_io(sbi, REQ_TIME) || (!f2fs_time_over(sbi, REQ_TIME) && f2fs_rwsem_is_locked(&sbi->cp_rwsem))) return; /* exceed periodical checkpoint timeout threshold */ if (f2fs_time_over(sbi, CP_TIME)) goto do_sync; /* checkpoint is the only way to shrink partial cached entries */ if (f2fs_available_free_memory(sbi, NAT_ENTRIES) && f2fs_available_free_memory(sbi, INO_ENTRIES)) return; do_sync: if (test_opt(sbi, DATA_FLUSH) && from_bg) { struct blk_plug plug; mutex_lock(&sbi->flush_lock); blk_start_plug(&plug); f2fs_sync_dirty_inodes(sbi, FILE_INODE, false); blk_finish_plug(&plug); mutex_unlock(&sbi->flush_lock); } stat_inc_cp_call_count(sbi, BACKGROUND); f2fs_sync_fs(sbi->sb, 1); } static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { int ret = blkdev_issue_flush(bdev); trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), test_opt(sbi, FLUSH_MERGE), ret); if (!ret) f2fs_update_iostat(sbi, NULL, FS_FLUSH_IO, 0); return ret; } static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino) { int ret = 0; int i; if (!f2fs_is_multi_device(sbi)) return __submit_flush_wait(sbi, sbi->sb->s_bdev); for (i = 0; i < sbi->s_ndevs; i++) { if (!f2fs_is_dirty_device(sbi, ino, i, FLUSH_INO)) continue; ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) break; } return ret; } static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; wait_queue_head_t *q = &fcc->flush_wait_queue; repeat: if (kthread_should_stop()) return 0; if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); cmd = llist_entry(fcc->dispatch_list, struct flush_cmd, llnode); ret = submit_flush_wait(sbi, cmd->ino); atomic_inc(&fcc->issued_flush); llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; complete(&cmd->wait); } fcc->dispatch_list = NULL; } wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; } int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; int ret; if (test_opt(sbi, NOBARRIER)) return 0; if (!test_opt(sbi, FLUSH_MERGE)) { atomic_inc(&fcc->queued_flush); ret = submit_flush_wait(sbi, ino); atomic_dec(&fcc->queued_flush); atomic_inc(&fcc->issued_flush); return ret; } if (atomic_inc_return(&fcc->queued_flush) == 1 || f2fs_is_multi_device(sbi)) { ret = submit_flush_wait(sbi, ino); atomic_dec(&fcc->queued_flush); atomic_inc(&fcc->issued_flush); return ret; } cmd.ino = ino; init_completion(&cmd.wait); llist_add(&cmd.llnode, &fcc->issue_list); /* * update issue_list before we wake up issue_flush thread, this * smp_mb() pairs with another barrier in ___wait_event(), see * more details in comments of waitqueue_active(). */ smp_mb(); if (waitqueue_active(&fcc->flush_wait_queue)) wake_up(&fcc->flush_wait_queue); if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); atomic_dec(&fcc->queued_flush); } else { struct llist_node *list; list = llist_del_all(&fcc->issue_list); if (!list) { wait_for_completion(&cmd.wait); atomic_dec(&fcc->queued_flush); } else { struct flush_cmd *tmp, *next; ret = submit_flush_wait(sbi, ino); llist_for_each_entry_safe(tmp, next, list, llnode) { if (tmp == &cmd) { cmd.ret = ret; atomic_dec(&fcc->queued_flush); continue; } tmp->ret = ret; complete(&tmp->wait); } } } return cmd.ret; } int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; if (fcc->f2fs_issue_flush) return 0; goto init_thread; } fcc = f2fs_kzalloc(sbi, sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; atomic_set(&fcc->issued_flush, 0); atomic_set(&fcc->queued_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; if (!test_opt(sbi, FLUSH_MERGE)) return 0; init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { int err = PTR_ERR(fcc->f2fs_issue_flush); fcc->f2fs_issue_flush = NULL; return err; } return 0; } void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; if (fcc && fcc->f2fs_issue_flush) { struct task_struct *flush_thread = fcc->f2fs_issue_flush; fcc->f2fs_issue_flush = NULL; kthread_stop(flush_thread); } if (free) { kfree(fcc); SM_I(sbi)->fcc_info = NULL; } } int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) { int ret = 0, i; if (!f2fs_is_multi_device(sbi)) return 0; if (test_opt(sbi, NOBARRIER)) return 0; for (i = 1; i < sbi->s_ndevs; i++) { int count = DEFAULT_RETRY_IO_COUNT; if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) continue; do { ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); } while (ret && --count); if (ret) { f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FLUSH_FAIL); break; } spin_lock(&sbi->dev_lock); f2fs_clear_bit(i, (char *)&sbi->dirty_device); spin_unlock(&sbi->dev_lock); } return ret; } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); /* need not be added */ if (IS_CURSEG(sbi, segno)) return; if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) dirty_i->nr_dirty[dirty_type]++; if (dirty_type == DIRTY) { struct seg_entry *sentry = get_seg_entry(sbi, segno); enum dirty_type t = sentry->type; if (unlikely(t >= DIRTY)) { f2fs_bug_on(sbi, 1); return; } if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]++; if (__is_large_section(sbi)) { unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); block_t valid_blocks = get_valid_blocks(sbi, segno, true); f2fs_bug_on(sbi, unlikely(!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi))); if (!IS_CURSEC(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); } } } static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); block_t valid_blocks; if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type])) dirty_i->nr_dirty[dirty_type]--; if (dirty_type == DIRTY) { struct seg_entry *sentry = get_seg_entry(sbi, segno); enum dirty_type t = sentry->type; if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; valid_blocks = get_valid_blocks(sbi, segno, true); if (valid_blocks == 0) { clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); #ifdef CONFIG_F2FS_CHECK_FS clear_bit(segno, SIT_I(sbi)->invalid_segmap); #endif } if (__is_large_section(sbi)) { unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi)) { clear_bit(secno, dirty_i->dirty_secmap); return; } if (!IS_CURSEC(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); } } } /* * Should not occur error such as -ENOMEM. * Adding dirty entry into seglist is not critical operation. * If a given segment is one of current working segments, it won't be added. */ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned short valid_blocks, ckpt_valid_blocks; unsigned int usable_blocks; if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno)) return; usable_blocks = f2fs_usable_blks_in_seg(sbi, segno); mutex_lock(&dirty_i->seglist_lock); valid_blocks = get_valid_blocks(sbi, segno, false); ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno, false); if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) || ckpt_valid_blocks == usable_blocks)) { __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); } else if (valid_blocks < usable_blocks) { __locate_dirty_segment(sbi, segno, DIRTY); } else { /* Recovery routine with SSR needs this */ __remove_dirty_segment(sbi, segno, DIRTY); } mutex_unlock(&dirty_i->seglist_lock); } /* This moves currently empty dirty blocks to prefree. Must hold seglist_lock */ void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int segno; mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { if (get_valid_blocks(sbi, segno, false)) continue; if (IS_CURSEG(sbi, segno)) continue; __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); } mutex_unlock(&dirty_i->seglist_lock); } block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) { int ovp_hole_segs = (overprovision_segments(sbi) - reserved_segments(sbi)); block_t ovp_holes = SEGS_TO_BLKS(sbi, ovp_hole_segs); struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); block_t holes[2] = {0, 0}; /* DATA and NODE */ block_t unusable; struct seg_entry *se; unsigned int segno; mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { se = get_seg_entry(sbi, segno); if (IS_NODESEG(se->type)) holes[NODE] += f2fs_usable_blks_in_seg(sbi, segno) - se->valid_blocks; else holes[DATA] += f2fs_usable_blks_in_seg(sbi, segno) - se->valid_blocks; } mutex_unlock(&dirty_i->seglist_lock); unusable = max(holes[DATA], holes[NODE]); if (unusable > ovp_holes) return unusable - ovp_holes; return 0; } int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable) { int ovp_hole_segs = (overprovision_segments(sbi) - reserved_segments(sbi)); if (F2FS_OPTION(sbi).unusable_cap_perc == 100) return 0; if (unusable > F2FS_OPTION(sbi).unusable_cap) return -EAGAIN; if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) && dirty_segments(sbi) > ovp_hole_segs) return -EAGAIN; if (has_not_enough_free_secs(sbi, 0, 0)) return -EAGAIN; return 0; } /* This is only used by SBI_CP_DISABLED */ static unsigned int get_free_segment(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int segno = 0; mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { if (get_valid_blocks(sbi, segno, false)) continue; if (get_ckpt_valid_blocks(sbi, segno, false)) continue; mutex_unlock(&dirty_i->seglist_lock); return segno; } mutex_unlock(&dirty_i->seglist_lock); return NULL_SEGNO; } static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc; f2fs_bug_on(sbi, !len); pend_list = &dcc->pend_list[plist_idx(len)]; dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS, true, NULL); INIT_LIST_HEAD(&dc->list); dc->bdev = bdev; dc->di.lstart = lstart; dc->di.start = start; dc->di.len = len; dc->ref = 0; dc->state = D_PREP; dc->queued = 0; dc->error = 0; init_completion(&dc->wait); list_add_tail(&dc->list, pend_list); spin_lock_init(&dc->lock); dc->bio_ref = 0; atomic_inc(&dcc->discard_cmd_cnt); dcc->undiscard_blks += len; return dc; } static bool f2fs_check_discard_tree(struct f2fs_sb_info *sbi) { #ifdef CONFIG_F2FS_CHECK_FS struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct rb_node *cur = rb_first_cached(&dcc->root), *next; struct discard_cmd *cur_dc, *next_dc; while (cur) { next = rb_next(cur); if (!next) return true; cur_dc = rb_entry(cur, struct discard_cmd, rb_node); next_dc = rb_entry(next, struct discard_cmd, rb_node); if (cur_dc->di.lstart + cur_dc->di.len > next_dc->di.lstart) { f2fs_info(sbi, "broken discard_rbtree, " "cur(%u, %u) next(%u, %u)", cur_dc->di.lstart, cur_dc->di.len, next_dc->di.lstart, next_dc->di.len); return false; } cur = next; } #endif return true; } static struct discard_cmd *__lookup_discard_cmd(struct f2fs_sb_info *sbi, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct rb_node *node = dcc->root.rb_root.rb_node; struct discard_cmd *dc; while (node) { dc = rb_entry(node, struct discard_cmd, rb_node); if (blkaddr < dc->di.lstart) node = node->rb_left; else if (blkaddr >= dc->di.lstart + dc->di.len) node = node->rb_right; else return dc; } return NULL; } static struct discard_cmd *__lookup_discard_cmd_ret(struct rb_root_cached *root, block_t blkaddr, struct discard_cmd **prev_entry, struct discard_cmd **next_entry, struct rb_node ***insert_p, struct rb_node **insert_parent) { struct rb_node **pnode = &root->rb_root.rb_node; struct rb_node *parent = NULL, *tmp_node; struct discard_cmd *dc; *insert_p = NULL; *insert_parent = NULL; *prev_entry = NULL; *next_entry = NULL; if (RB_EMPTY_ROOT(&root->rb_root)) return NULL; while (*pnode) { parent = *pnode; dc = rb_entry(*pnode, struct discard_cmd, rb_node); if (blkaddr < dc->di.lstart) pnode = &(*pnode)->rb_left; else if (blkaddr >= dc->di.lstart + dc->di.len) pnode = &(*pnode)->rb_right; else goto lookup_neighbors; } *insert_p = pnode; *insert_parent = parent; dc = rb_entry(parent, struct discard_cmd, rb_node); tmp_node = parent; if (parent && blkaddr > dc->di.lstart) tmp_node = rb_next(parent); *next_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); tmp_node = parent; if (parent && blkaddr < dc->di.lstart) tmp_node = rb_prev(parent); *prev_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); return NULL; lookup_neighbors: /* lookup prev node for merging backward later */ tmp_node = rb_prev(&dc->rb_node); *prev_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); /* lookup next node for merging frontward later */ tmp_node = rb_next(&dc->rb_node); *next_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); return dc; } static void __detach_discard_cmd(struct discard_cmd_control *dcc, struct discard_cmd *dc) { if (dc->state == D_DONE) atomic_sub(dc->queued, &dcc->queued_discard); list_del(&dc->list); rb_erase_cached(&dc->rb_node, &dcc->root); dcc->undiscard_blks -= dc->di.len; kmem_cache_free(discard_cmd_slab, dc); atomic_dec(&dcc->discard_cmd_cnt); } static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; unsigned long flags; trace_f2fs_remove_discard(dc->bdev, dc->di.start, dc->di.len); spin_lock_irqsave(&dc->lock, flags); if (dc->bio_ref) { spin_unlock_irqrestore(&dc->lock, flags); return; } spin_unlock_irqrestore(&dc->lock, flags); f2fs_bug_on(sbi, dc->ref); if (dc->error == -EOPNOTSUPP) dc->error = 0; if (dc->error) printk_ratelimited( "%sF2FS-fs (%s): Issue discard(%u, %u, %u) failed, ret: %d", KERN_INFO, sbi->sb->s_id, dc->di.lstart, dc->di.start, dc->di.len, dc->error); __detach_discard_cmd(dcc, dc); } static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; unsigned long flags; spin_lock_irqsave(&dc->lock, flags); if (!dc->error) dc->error = blk_status_to_errno(bio->bi_status); dc->bio_ref--; if (!dc->bio_ref && dc->state == D_SUBMIT) { dc->state = D_DONE; complete_all(&dc->wait); } spin_unlock_irqrestore(&dc->lock, flags); bio_put(bio); } static void __check_sit_bitmap(struct f2fs_sb_info *sbi, block_t start, block_t end) { #ifdef CONFIG_F2FS_CHECK_FS struct seg_entry *sentry; unsigned int segno; block_t blk = start; unsigned long offset, size, *map; while (blk < end) { segno = GET_SEGNO(sbi, blk); sentry = get_seg_entry(sbi, segno); offset = GET_BLKOFF_FROM_SEG0(sbi, blk); if (end < START_BLOCK(sbi, segno + 1)) size = GET_BLKOFF_FROM_SEG0(sbi, end); else size = BLKS_PER_SEG(sbi); map = (unsigned long *)(sentry->cur_valid_map); offset = __find_rev_next_bit(map, size, offset); f2fs_bug_on(sbi, offset != size); blk = START_BLOCK(sbi, segno + 1); } #endif } static void __init_discard_policy(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, int discard_type, unsigned int granularity) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; /* common policy */ dpolicy->type = discard_type; dpolicy->sync = true; dpolicy->ordered = false; dpolicy->granularity = granularity; dpolicy->max_requests = dcc->max_discard_request; dpolicy->io_aware_gran = dcc->discard_io_aware_gran; dpolicy->timeout = false; if (discard_type == DPOLICY_BG) { dpolicy->min_interval = dcc->min_discard_issue_time; dpolicy->mid_interval = dcc->mid_discard_issue_time; dpolicy->max_interval = dcc->max_discard_issue_time; if (dcc->discard_io_aware == DPOLICY_IO_AWARE_ENABLE) dpolicy->io_aware = true; else if (dcc->discard_io_aware == DPOLICY_IO_AWARE_DISABLE) dpolicy->io_aware = false; dpolicy->sync = false; dpolicy->ordered = true; if (utilization(sbi) > dcc->discard_urgent_util) { dpolicy->granularity = MIN_DISCARD_GRANULARITY; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = dcc->min_discard_issue_time; } } else if (discard_type == DPOLICY_FORCE) { dpolicy->min_interval = dcc->min_discard_issue_time; dpolicy->mid_interval = dcc->mid_discard_issue_time; dpolicy->max_interval = dcc->max_discard_issue_time; dpolicy->io_aware = false; } else if (discard_type == DPOLICY_FSTRIM) { dpolicy->io_aware = false; } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->io_aware = false; /* we need to issue all to keep CP_TRIMMED_FLAG */ dpolicy->granularity = MIN_DISCARD_GRANULARITY; dpolicy->timeout = true; } } static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len); #ifdef CONFIG_BLK_DEV_ZONED static void __submit_zone_reset_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc, blk_opf_t flag, struct list_head *wait_list, unsigned int *issued) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct block_device *bdev = dc->bdev; struct bio *bio = bio_alloc(bdev, 0, REQ_OP_ZONE_RESET | flag, GFP_NOFS); unsigned long flags; trace_f2fs_issue_reset_zone(bdev, dc->di.start); spin_lock_irqsave(&dc->lock, flags); dc->state = D_SUBMIT; dc->bio_ref++; spin_unlock_irqrestore(&dc->lock, flags); if (issued) (*issued)++; atomic_inc(&dcc->queued_discard); dc->queued++; list_move_tail(&dc->list, wait_list); /* sanity check on discard range */ __check_sit_bitmap(sbi, dc->di.lstart, dc->di.lstart + dc->di.len); bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(dc->di.start); bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(bio); atomic_inc(&dcc->issued_discard); f2fs_update_iostat(sbi, NULL, FS_ZONE_RESET_IO, dc->di.len * F2FS_BLKSIZE); } #endif /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, struct discard_cmd *dc, int *issued) { struct block_device *bdev = dc->bdev; unsigned int max_discard_blocks = SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); blk_opf_t flag = dpolicy->sync ? REQ_SYNC : 0; block_t lstart, start, len, total_len; int err = 0; if (dc->state != D_PREP) return 0; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) return 0; #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) { int devi = f2fs_bdev_index(sbi, bdev); if (devi < 0) return -EINVAL; if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) { __submit_zone_reset_cmd(sbi, dc, flag, wait_list, issued); return 0; } } #endif trace_f2fs_issue_discard(bdev, dc->di.start, dc->di.len); lstart = dc->di.lstart; start = dc->di.start; len = dc->di.len; total_len = len; dc->di.len = 0; while (total_len && *issued < dpolicy->max_requests && !err) { struct bio *bio = NULL; unsigned long flags; bool last = true; if (len > max_discard_blocks) { len = max_discard_blocks; last = false; } (*issued)++; if (*issued == dpolicy->max_requests) last = true; dc->di.len += len; if (time_to_inject(sbi, FAULT_DISCARD)) { err = -EIO; } else { err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), GFP_NOFS, &bio); } if (err) { spin_lock_irqsave(&dc->lock, flags); if (dc->state == D_PARTIAL) dc->state = D_SUBMIT; spin_unlock_irqrestore(&dc->lock, flags); break; } f2fs_bug_on(sbi, !bio); /* * should keep before submission to avoid D_DONE * right away */ spin_lock_irqsave(&dc->lock, flags); if (last) dc->state = D_SUBMIT; else dc->state = D_PARTIAL; dc->bio_ref++; spin_unlock_irqrestore(&dc->lock, flags); atomic_inc(&dcc->queued_discard); dc->queued++; list_move_tail(&dc->list, wait_list); /* sanity check on discard range */ __check_sit_bitmap(sbi, lstart, lstart + len); bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; bio->bi_opf |= flag; submit_bio(bio); atomic_inc(&dcc->issued_discard); f2fs_update_iostat(sbi, NULL, FS_DISCARD_IO, len * F2FS_BLKSIZE); lstart += len; start += len; total_len -= len; len = total_len; } if (!err && len) { dcc->undiscard_blks -= len; __update_discard_tree_range(sbi, bdev, lstart, start, len); } return err; } static void __insert_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct rb_node **p = &dcc->root.rb_root.rb_node; struct rb_node *parent = NULL; struct discard_cmd *dc; bool leftmost = true; /* look up rb tree to find parent node */ while (*p) { parent = *p; dc = rb_entry(parent, struct discard_cmd, rb_node); if (lstart < dc->di.lstart) { p = &(*p)->rb_left; } else if (lstart >= dc->di.lstart + dc->di.len) { p = &(*p)->rb_right; leftmost = false; } else { /* Let's skip to add, if exists */ return; } } dc = __create_discard_cmd(sbi, bdev, lstart, start, len); rb_link_node(&dc->rb_node, parent, p); rb_insert_color_cached(&dc->rb_node, &dcc->root, leftmost); } static void __relocate_discard_cmd(struct discard_cmd_control *dcc, struct discard_cmd *dc) { list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->di.len)]); } static void __punch_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_info di = dc->di; bool modified = false; if (dc->state == D_DONE || dc->di.len == 1) { __remove_discard_cmd(sbi, dc); return; } dcc->undiscard_blks -= di.len; if (blkaddr > di.lstart) { dc->di.len = blkaddr - dc->di.lstart; dcc->undiscard_blks += dc->di.len; __relocate_discard_cmd(dcc, dc); modified = true; } if (blkaddr < di.lstart + di.len - 1) { if (modified) { __insert_discard_cmd(sbi, dc->bdev, blkaddr + 1, di.start + blkaddr + 1 - di.lstart, di.lstart + di.len - 1 - blkaddr); } else { dc->di.lstart++; dc->di.len--; dc->di.start++; dcc->undiscard_blks += dc->di.len; __relocate_discard_cmd(dcc, dc); } } } static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *prev_dc = NULL, *next_dc = NULL; struct discard_cmd *dc; struct discard_info di = {0}; struct rb_node **insert_p = NULL, *insert_parent = NULL; unsigned int max_discard_blocks = SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); block_t end = lstart + len; dc = __lookup_discard_cmd_ret(&dcc->root, lstart, &prev_dc, &next_dc, &insert_p, &insert_parent); if (dc) prev_dc = dc; if (!prev_dc) { di.lstart = lstart; di.len = next_dc ? next_dc->di.lstart - lstart : len; di.len = min(di.len, len); di.start = start; } while (1) { struct rb_node *node; bool merged = false; struct discard_cmd *tdc = NULL; if (prev_dc) { di.lstart = prev_dc->di.lstart + prev_dc->di.len; if (di.lstart < lstart) di.lstart = lstart; if (di.lstart >= end) break; if (!next_dc || next_dc->di.lstart > end) di.len = end - di.lstart; else di.len = next_dc->di.lstart - di.lstart; di.start = start + di.lstart - lstart; } if (!di.len) goto next; if (prev_dc && prev_dc->state == D_PREP && prev_dc->bdev == bdev && __is_discard_back_mergeable(&di, &prev_dc->di, max_discard_blocks)) { prev_dc->di.len += di.len; dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, prev_dc); di = prev_dc->di; tdc = prev_dc; merged = true; } if (next_dc && next_dc->state == D_PREP && next_dc->bdev == bdev && __is_discard_front_mergeable(&di, &next_dc->di, max_discard_blocks)) { next_dc->di.lstart = di.lstart; next_dc->di.len += di.len; next_dc->di.start = di.start; dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, next_dc); if (tdc) __remove_discard_cmd(sbi, tdc); merged = true; } if (!merged) __insert_discard_cmd(sbi, bdev, di.lstart, di.start, di.len); next: prev_dc = next_dc; if (!prev_dc) break; node = rb_next(&prev_dc->rb_node); next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); } } #ifdef CONFIG_BLK_DEV_ZONED static void __queue_zone_reset_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t lblkstart, block_t blklen) { trace_f2fs_queue_reset_zone(bdev, blkstart); mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __insert_discard_cmd(sbi, bdev, lblkstart, blkstart, blklen); mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); } #endif static void __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { block_t lblkstart = blkstart; if (!f2fs_bdev_support_discard(bdev)) return; trace_f2fs_queue_discard(bdev, blkstart, blklen); if (f2fs_is_multi_device(sbi)) { int devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; } mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); } static void __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, int *issued) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *prev_dc = NULL, *next_dc = NULL; struct rb_node **insert_p = NULL, *insert_parent = NULL; struct discard_cmd *dc; struct blk_plug plug; bool io_interrupted = false; mutex_lock(&dcc->cmd_lock); dc = __lookup_discard_cmd_ret(&dcc->root, dcc->next_pos, &prev_dc, &next_dc, &insert_p, &insert_parent); if (!dc) dc = next_dc; blk_start_plug(&plug); while (dc) { struct rb_node *node; int err = 0; if (dc->state != D_PREP) goto next; if (dpolicy->io_aware && !is_idle(sbi, DISCARD_TIME)) { io_interrupted = true; break; } dcc->next_pos = dc->di.lstart + dc->di.len; err = __submit_discard_cmd(sbi, dpolicy, dc, issued); if (*issued >= dpolicy->max_requests) break; next: node = rb_next(&dc->rb_node); if (err) __remove_discard_cmd(sbi, dc); dc = rb_entry_safe(node, struct discard_cmd, rb_node); } blk_finish_plug(&plug); if (!dc) dcc->next_pos = 0; mutex_unlock(&dcc->cmd_lock); if (!(*issued) && io_interrupted) *issued = -1; } static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy); static int __issue_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; int i, issued; bool io_interrupted = false; if (dpolicy->timeout) f2fs_update_time(sbi, UMOUNT_DISCARD_TIMEOUT); retry: issued = 0; for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { if (dpolicy->timeout && f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) break; if (i + 1 < dpolicy->granularity) break; if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) { __issue_discard_cmd_orderly(sbi, dpolicy, &issued); return issued; } pend_list = &dcc->pend_list[i]; mutex_lock(&dcc->cmd_lock); if (list_empty(pend_list)) goto next; if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_discard_tree(sbi)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); if (dpolicy->timeout && f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) break; if (dpolicy->io_aware && i < dpolicy->io_aware_gran && !is_idle(sbi, DISCARD_TIME)) { io_interrupted = true; break; } __submit_discard_cmd(sbi, dpolicy, dc, &issued); if (issued >= dpolicy->max_requests) break; } blk_finish_plug(&plug); next: mutex_unlock(&dcc->cmd_lock); if (issued >= dpolicy->max_requests || io_interrupted) break; } if (dpolicy->type == DPOLICY_UMOUNT && issued) { __wait_all_discard_cmd(sbi, dpolicy); goto retry; } if (!issued && io_interrupted) issued = -1; return issued; } static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; int i; bool dropped = false; mutex_lock(&dcc->cmd_lock); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { pend_list = &dcc->pend_list[i]; list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); __remove_discard_cmd(sbi, dc); dropped = true; } } mutex_unlock(&dcc->cmd_lock); return dropped; } void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi) { __drop_discard_cmd(sbi); } static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; unsigned int len = 0; wait_for_completion_io(&dc->wait); mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, dc->state != D_DONE); dc->ref--; if (!dc->ref) { if (!dc->error) len = dc->di.len; __remove_discard_cmd(sbi, dc); } mutex_unlock(&dcc->cmd_lock); return len; } static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, block_t start, block_t end) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); struct discard_cmd *dc = NULL, *iter, *tmp; unsigned int trimmed = 0; next: dc = NULL; mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(iter, tmp, wait_list, list) { if (iter->di.lstart + iter->di.len <= start || end <= iter->di.lstart) continue; if (iter->di.len < dpolicy->granularity) continue; if (iter->state == D_DONE && !iter->ref) { wait_for_completion_io(&iter->wait); if (!iter->error) trimmed += iter->di.len; __remove_discard_cmd(sbi, iter); } else { iter->ref++; dc = iter; break; } } mutex_unlock(&dcc->cmd_lock); if (dc) { trimmed += __wait_one_discard_bio(sbi, dc); goto next; } return trimmed; } static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { struct discard_policy dp; unsigned int discard_blks; if (dpolicy) return __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); /* wait all */ __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, MIN_DISCARD_GRANULARITY); discard_blks = __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, MIN_DISCARD_GRANULARITY); discard_blks += __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); return discard_blks; } /* This should be covered by global mutex, &sit_i->sentry_lock */ static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *dc; bool need_wait = false; mutex_lock(&dcc->cmd_lock); dc = __lookup_discard_cmd(sbi, blkaddr); #ifdef CONFIG_BLK_DEV_ZONED if (dc && f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(dc->bdev)) { int devi = f2fs_bdev_index(sbi, dc->bdev); if (devi < 0) { mutex_unlock(&dcc->cmd_lock); return; } if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) { /* force submit zone reset */ if (dc->state == D_PREP) __submit_zone_reset_cmd(sbi, dc, REQ_SYNC, &dcc->wait_list, NULL); dc->ref++; mutex_unlock(&dcc->cmd_lock); /* wait zone reset */ __wait_one_discard_bio(sbi, dc); return; } } #endif if (dc) { if (dc->state == D_PREP) { __punch_discard_cmd(sbi, dc, blkaddr); } else { dc->ref++; need_wait = true; } } mutex_unlock(&dcc->cmd_lock); if (need_wait) __wait_one_discard_bio(sbi, dc); } void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; if (dcc && dcc->f2fs_issue_discard) { struct task_struct *discard_thread = dcc->f2fs_issue_discard; dcc->f2fs_issue_discard = NULL; kthread_stop(discard_thread); } } /** * f2fs_issue_discard_timeout() - Issue all discard cmd within UMOUNT_DISCARD_TIMEOUT * @sbi: the f2fs_sb_info data for discard cmd to issue * * When UMOUNT_DISCARD_TIMEOUT is exceeded, all remaining discard commands will be dropped * * Return true if issued all discard cmd or no discard cmd need issue, otherwise return false. */ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_policy dpolicy; bool dropped; if (!atomic_read(&dcc->discard_cmd_cnt)) return true; __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); /* just to make sure there is no pending discard commands */ __wait_all_discard_cmd(sbi, NULL); f2fs_bug_on(sbi, atomic_read(&dcc->discard_cmd_cnt)); return !dropped; } static int issue_discard_thread(void *data) { struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; struct discard_policy dpolicy; unsigned int wait_ms = dcc->min_discard_issue_time; int issued; set_freezable(); do { wait_event_freezable_timeout(*q, kthread_should_stop() || dcc->discard_wake, msecs_to_jiffies(wait_ms)); if (sbi->gc_mode == GC_URGENT_HIGH || !f2fs_available_free_memory(sbi, DISCARD_CACHE)) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, MIN_DISCARD_GRANULARITY); else __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity); if (dcc->discard_wake) dcc->discard_wake = false; /* clean up pending candidates before going to sleep */ if (atomic_read(&dcc->queued_discard)) __wait_all_discard_cmd(sbi, NULL); if (f2fs_readonly(sbi->sb)) continue; if (kthread_should_stop()) return 0; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || !atomic_read(&dcc->discard_cmd_cnt)) { wait_ms = dpolicy.max_interval; continue; } sb_start_intwrite(sbi->sb); issued = __issue_discard_cmd(sbi, &dpolicy); if (issued > 0) { __wait_all_discard_cmd(sbi, &dpolicy); wait_ms = dpolicy.min_interval; } else if (issued == -1) { wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME); if (!wait_ms) wait_ms = dpolicy.mid_interval; } else { wait_ms = dpolicy.max_interval; } if (!atomic_read(&dcc->discard_cmd_cnt)) wait_ms = dpolicy.max_interval; sb_end_intwrite(sbi->sb); } while (!kthread_should_stop()); return 0; } #ifdef CONFIG_BLK_DEV_ZONED static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { sector_t sector, nr_sects; block_t lblkstart = blkstart; int devi = 0; u64 remainder = 0; if (f2fs_is_multi_device(sbi)) { devi = f2fs_target_device_index(sbi, blkstart); if (blkstart < FDEV(devi).start_blk || blkstart > FDEV(devi).end_blk) { f2fs_err(sbi, "Invalid block %x", blkstart); return -EIO; } blkstart -= FDEV(devi).start_blk; } /* For sequential zones, reset the zone write pointer */ if (f2fs_blkz_is_seq(sbi, devi, blkstart)) { sector = SECTOR_FROM_BLOCK(blkstart); nr_sects = SECTOR_FROM_BLOCK(blklen); div64_u64_rem(sector, bdev_zone_sectors(bdev), &remainder); if (remainder || nr_sects != bdev_zone_sectors(bdev)) { f2fs_err(sbi, "(%d) %s: Unaligned zone reset attempted (block %x + %x)", devi, sbi->s_ndevs ? FDEV(devi).path : "", blkstart, blklen); return -EIO; } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) { unsigned int nofs_flags; int ret; trace_f2fs_issue_reset_zone(bdev, blkstart); nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, sector, nr_sects); memalloc_nofs_restore(nofs_flags); return ret; } __queue_zone_reset_cmd(sbi, bdev, blkstart, lblkstart, blklen); return 0; } /* For conventional zones, use regular discard if supported */ __queue_discard_cmd(sbi, bdev, lblkstart, blklen); return 0; } #endif static int __issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif __queue_discard_cmd(sbi, bdev, blkstart, blklen); return 0; } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { sector_t start = blkstart, len = 0; struct block_device *bdev; struct seg_entry *se; unsigned int offset; block_t i; int err = 0; bdev = f2fs_target_device(sbi, blkstart, NULL); for (i = blkstart; i < blkstart + blklen; i++, len++) { if (i != start) { struct block_device *bdev2 = f2fs_target_device(sbi, i, NULL); if (bdev2 != bdev) { err = __issue_discard_async(sbi, bdev, start, len); if (err) return err; bdev = bdev2; start = i; len = 0; } } se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); if (f2fs_block_unit_discard(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } if (len) err = __issue_discard_async(sbi, bdev, start, len); return err; } static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, bool check_only) { int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start); unsigned long *cur_map = (unsigned long *)se->cur_valid_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; unsigned long *discard_map = (unsigned long *)se->discard_map; unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; bool force = (cpc->reason & CP_DISCARD); struct discard_entry *de = NULL; struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; if (se->valid_blocks == BLKS_PER_SEG(sbi) || !f2fs_hw_support_discard(sbi) || !f2fs_block_unit_discard(sbi)) return false; if (!force) { if (!f2fs_realtime_discard_enable(sbi) || !se->valid_blocks || SM_I(sbi)->dcc_info->nr_discards >= SM_I(sbi)->dcc_info->max_discards) return false; } /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ for (i = 0; i < entries; i++) dmap[i] = force ? ~ckpt_map[i] & ~discard_map[i] : (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; while (force || SM_I(sbi)->dcc_info->nr_discards <= SM_I(sbi)->dcc_info->max_discards) { start = __find_rev_next_bit(dmap, BLKS_PER_SEG(sbi), end + 1); if (start >= BLKS_PER_SEG(sbi)) break; end = __find_rev_next_zero_bit(dmap, BLKS_PER_SEG(sbi), start + 1); if (force && start && end != BLKS_PER_SEG(sbi) && (end - start) < cpc->trim_minlen) continue; if (check_only) return true; if (!de) { de = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_F2FS_ZERO, true, NULL); de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); list_add_tail(&de->list, head); } for (i = start; i < end; i++) __set_bit_le(i, (void *)de->discard_map); SM_I(sbi)->dcc_info->nr_discards += end - start; } return false; } static void release_discard_addr(struct discard_entry *entry) { list_del(&entry->list); kmem_cache_free(discard_entry_slab, entry); } void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi) { struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; /* drop caches */ list_for_each_entry_safe(entry, this, head, list) release_discard_addr(entry); } /* * Should call f2fs_clear_prefree_segments after checkpoint is done. */ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int segno; mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi)) __set_test_and_free(sbi, segno, false); mutex_unlock(&dirty_i->seglist_lock); } void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *head = &dcc->entry_list; struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason & CP_DISCARD); bool section_alignment = F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION; if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) section_alignment = true; mutex_lock(&dirty_i->seglist_lock); while (1) { int i; if (section_alignment && end != -1) end--; start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1); if (start >= MAIN_SEGS(sbi)) break; end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi), start + 1); if (section_alignment) { start = rounddown(start, SEGS_PER_SEC(sbi)); end = roundup(end, SEGS_PER_SEC(sbi)); } for (i = start; i < end; i++) { if (test_and_clear_bit(i, prefree_map)) dirty_i->nr_dirty[PRE]--; } if (!f2fs_realtime_discard_enable(sbi)) continue; if (force && start >= cpc->trim_start && (end - 1) <= cpc->trim_end) continue; /* Should cover 2MB zoned device for zone-based reset */ if (!f2fs_sb_has_blkzoned(sbi) && (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi))) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), SEGS_TO_BLKS(sbi, end - start)); continue; } next: secno = GET_SEC_FROM_SEG(sbi, start); start_segno = GET_SEG_FROM_SEC(sbi, secno); if (!IS_CURSEC(sbi, secno) && !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), BLKS_PER_SEC(sbi)); start = start_segno + SEGS_PER_SEC(sbi); if (start < end) goto next; else end = start - 1; } mutex_unlock(&dirty_i->seglist_lock); if (!f2fs_block_unit_discard(sbi)) goto wakeup; /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { unsigned int cur_pos = 0, next_pos, len, total_len = 0; bool is_valid = test_bit_le(0, entry->discard_map); find_next: if (is_valid) { next_pos = find_next_zero_bit_le(entry->discard_map, BLKS_PER_SEG(sbi), cur_pos); len = next_pos - cur_pos; if (f2fs_sb_has_blkzoned(sbi) || (force && len < cpc->trim_minlen)) goto skip; f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, len); total_len += len; } else { next_pos = find_next_bit_le(entry->discard_map, BLKS_PER_SEG(sbi), cur_pos); } skip: cur_pos = next_pos; is_valid = !is_valid; if (cur_pos < BLKS_PER_SEG(sbi)) goto find_next; release_discard_addr(entry); dcc->nr_discards -= total_len; } wakeup: wake_up_discard_thread(sbi, false); } int f2fs_start_discard_thread(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; int err = 0; if (f2fs_sb_has_readonly(sbi)) { f2fs_info(sbi, "Skip to start discard thread for readonly image"); return 0; } if (!f2fs_realtime_discard_enable(sbi)) return 0; dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(dcc->f2fs_issue_discard)) { err = PTR_ERR(dcc->f2fs_issue_discard); dcc->f2fs_issue_discard = NULL; } return err; } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc; int err = 0, i; if (SM_I(sbi)->dcc_info) { dcc = SM_I(sbi)->dcc_info; goto init_thread; } dcc = f2fs_kzalloc(sbi, sizeof(struct discard_cmd_control), GFP_KERNEL); if (!dcc) return -ENOMEM; dcc->discard_io_aware_gran = MAX_PLIST_NUM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE; if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) dcc->discard_granularity = BLKS_PER_SEG(sbi); else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) dcc->discard_granularity = BLKS_PER_SEC(sbi); INIT_LIST_HEAD(&dcc->entry_list); for (i = 0; i < MAX_PLIST_NUM; i++) INIT_LIST_HEAD(&dcc->pend_list[i]); INIT_LIST_HEAD(&dcc->wait_list); INIT_LIST_HEAD(&dcc->fstrim_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); atomic_set(&dcc->queued_discard, 0); atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; dcc->max_discards = SEGS_TO_BLKS(sbi, MAIN_SEGS(sbi)); dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST; dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME; dcc->discard_urgent_util = DEF_DISCARD_URGENT_UTIL; dcc->undiscard_blks = 0; dcc->next_pos = 0; dcc->root = RB_ROOT_CACHED; dcc->rbtree_check = false; init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; init_thread: err = f2fs_start_discard_thread(sbi); if (err) { kfree(dcc); SM_I(sbi)->dcc_info = NULL; } return err; } static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; if (!dcc) return; f2fs_stop_discard_thread(sbi); /* * Recovery can cache discard commands, so in error path of * fill_super(), it needs to give a chance to handle them. */ f2fs_issue_discard_timeout(sbi); kfree(dcc); SM_I(sbi)->dcc_info = NULL; } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) { sit_i->dirty_sentries++; return false; } return true; } static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, unsigned int segno, int modified) { struct seg_entry *se = get_seg_entry(sbi, segno); se->type = type; if (modified) __mark_sit_entry_dirty(sbi, segno); } static inline unsigned long long get_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr) { unsigned int segno = GET_SEGNO(sbi, blkaddr); if (segno == NULL_SEGNO) return 0; return get_seg_entry(sbi, segno)->mtime; } static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned long long old_mtime) { struct seg_entry *se; unsigned int segno = GET_SEGNO(sbi, blkaddr); unsigned long long ctime = get_mtime(sbi, false); unsigned long long mtime = old_mtime ? old_mtime : ctime; if (segno == NULL_SEGNO) return; se = get_seg_entry(sbi, segno); if (!se->mtime) se->mtime = mtime; else se->mtime = div_u64(se->mtime * se->valid_blocks + mtime, se->valid_blocks + 1); if (ctime > SIT_I(sbi)->max_mtime) SIT_I(sbi)->max_mtime = ctime; } static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) { struct seg_entry *se; unsigned int segno, offset; long int new_vblocks; bool exist; #ifdef CONFIG_F2FS_CHECK_FS bool mir_exist; #endif segno = GET_SEGNO(sbi, blkaddr); if (segno == NULL_SEGNO) return; se = get_seg_entry(sbi, segno); new_vblocks = se->valid_blocks + del; offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); f2fs_bug_on(sbi, (new_vblocks < 0 || (new_vblocks > f2fs_usable_blks_in_seg(sbi, segno)))); se->valid_blocks = new_vblocks; /* Update valid block bitmap */ if (del > 0) { exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS mir_exist = f2fs_test_and_set_bit(offset, se->cur_valid_map_mir); if (unlikely(exist != mir_exist)) { f2fs_err(sbi, "Inconsistent error when setting bitmap, blk:%u, old bit:%d", blkaddr, exist); f2fs_bug_on(sbi, 1); } #endif if (unlikely(exist)) { f2fs_err(sbi, "Bitmap was wrongly set, blk:%u", blkaddr); f2fs_bug_on(sbi, 1); se->valid_blocks--; del = 0; } if (f2fs_block_unit_discard(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; /* * SSR should never reuse block which is checkpointed * or newly invalidated. */ if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) se->ckpt_valid_blocks++; } } else { exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS mir_exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map_mir); if (unlikely(exist != mir_exist)) { f2fs_err(sbi, "Inconsistent error when clearing bitmap, blk:%u, old bit:%d", blkaddr, exist); f2fs_bug_on(sbi, 1); } #endif if (unlikely(!exist)) { f2fs_err(sbi, "Bitmap was wrongly cleared, blk:%u", blkaddr); f2fs_bug_on(sbi, 1); se->valid_blocks++; del = 0; } else if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { /* * If checkpoints are off, we must not reuse data that * was used in the previous checkpoint. If it was used * before, we must track that to know how much space we * really have. */ if (f2fs_test_bit(offset, se->ckpt_valid_map)) { spin_lock(&sbi->stat_lock); sbi->unusable_block_count++; spin_unlock(&sbi->stat_lock); } } if (f2fs_block_unit_discard(sbi) && f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) se->ckpt_valid_blocks += del; __mark_sit_entry_dirty(sbi, segno); /* update total number of valid blocks to be written in ckpt area */ SIT_I(sbi)->written_valid_blocks += del; if (__is_large_section(sbi)) get_sec_entry(sbi, segno)->valid_blocks += del; } void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) { unsigned int segno = GET_SEGNO(sbi, addr); struct sit_info *sit_i = SIT_I(sbi); f2fs_bug_on(sbi, addr == NULL_ADDR); if (addr == NEW_ADDR || addr == COMPRESS_ADDR) return; f2fs_invalidate_internal_cache(sbi, addr); /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); update_segment_mtime(sbi, addr, 0); update_sit_entry(sbi, addr, -1); /* add it into dirty seglist */ locate_dirty_segment(sbi, segno); up_write(&sit_i->sentry_lock); } bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) { struct sit_info *sit_i = SIT_I(sbi); unsigned int segno, offset; struct seg_entry *se; bool is_cp = false; if (!__is_valid_data_blkaddr(blkaddr)) return true; down_read(&sit_i->sentry_lock); segno = GET_SEGNO(sbi, blkaddr); se = get_seg_entry(sbi, segno); offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); if (f2fs_test_bit(offset, se->ckpt_valid_map)) is_cp = true; up_read(&sit_i->sentry_lock); return is_cp; } static unsigned short f2fs_curseg_valid_blocks(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); if (sbi->ckpt->alloc_type[type] == SSR) return BLKS_PER_SEG(sbi); return curseg->next_blkoff; } /* * Calculate the number of current summary pages for writing */ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) { int valid_sum_count = 0; int i, sum_in_page; for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { if (sbi->ckpt->alloc_type[i] != SSR && for_ra) valid_sum_count += le16_to_cpu(F2FS_CKPT(sbi)->cur_data_blkoff[i]); else valid_sum_count += f2fs_curseg_valid_blocks(sbi, i); } sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE; if (valid_sum_count <= sum_in_page) return 1; else if ((valid_sum_count - sum_in_page) <= (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) return 2; return 3; } /* * Caller should put this summary page */ struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) { if (unlikely(f2fs_cp_error(sbi))) return ERR_PTR(-EIO); return f2fs_get_meta_page_retry(sbi, GET_SUM_BLOCK(sbi, segno)); } void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) { struct page *page = f2fs_grab_meta_page(sbi, blk_addr); memcpy(page_address(page), src, PAGE_SIZE); set_page_dirty(page); f2fs_put_page(page, 1); } static void write_sum_page(struct f2fs_sb_info *sbi, struct f2fs_summary_block *sum_blk, block_t blk_addr) { f2fs_update_meta_page(sbi, (void *)sum_blk, blk_addr); } static void write_current_sum_page(struct f2fs_sb_info *sbi, int type, block_t blk_addr) { struct curseg_info *curseg = CURSEG_I(sbi, type); struct page *page = f2fs_grab_meta_page(sbi, blk_addr); struct f2fs_summary_block *src = curseg->sum_blk; struct f2fs_summary_block *dst; dst = (struct f2fs_summary_block *)page_address(page); memset(dst, 0, PAGE_SIZE); mutex_lock(&curseg->curseg_mutex); down_read(&curseg->journal_rwsem); memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE); up_read(&curseg->journal_rwsem); memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE); memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE); mutex_unlock(&curseg->curseg_mutex); set_page_dirty(page); f2fs_put_page(page, 1); } static int is_next_segment_free(struct f2fs_sb_info *sbi, struct curseg_info *curseg, int type) { unsigned int segno = curseg->segno + 1; struct free_segmap_info *free_i = FREE_I(sbi); if (segno < MAIN_SEGS(sbi) && segno % SEGS_PER_SEC(sbi)) return !test_bit(segno, free_i->free_segmap); return 0; } /* * Find a new segment from the free segments bitmap to right order * This function should be returned with success, otherwise BUG */ static int get_new_segment(struct f2fs_sb_info *sbi, unsigned int *newseg, bool new_sec, bool pinning) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno, secno, zoneno; unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); bool init = true; int i; int ret = 0; spin_lock(&free_i->segmap_lock); if (time_to_inject(sbi, FAULT_NO_SEGMENT)) { ret = -ENOSPC; goto out_unlock; } if (!new_sec && ((*newseg + 1) % SEGS_PER_SEC(sbi))) { segno = find_next_zero_bit(free_i->free_segmap, GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) goto got_it; } /* * If we format f2fs on zoned storage, let's try to get pinned sections * from beginning of the storage, which should be a conventional one. */ if (f2fs_sb_has_blkzoned(sbi)) { segno = pinning ? 0 : max(first_zoned_segno(sbi), *newseg); hint = GET_SEC_FROM_SEG(sbi, segno); } find_other_zone: secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); if (secno >= MAIN_SECS(sbi)) { secno = find_first_zero_bit(free_i->free_secmap, MAIN_SECS(sbi)); if (secno >= MAIN_SECS(sbi)) { ret = -ENOSPC; goto out_unlock; } } segno = GET_SEG_FROM_SEC(sbi, secno); zoneno = GET_ZONE_FROM_SEC(sbi, secno); /* give up on finding another zone */ if (!init) goto got_it; if (sbi->secs_per_zone == 1) goto got_it; if (zoneno == old_zoneno) goto got_it; for (i = 0; i < NR_CURSEG_TYPE; i++) if (CURSEG_I(sbi, i)->zone == zoneno) break; if (i < NR_CURSEG_TYPE) { /* zone is in user, try another */ if (zoneno + 1 >= total_zones) hint = 0; else hint = (zoneno + 1) * sbi->secs_per_zone; init = false; goto find_other_zone; } got_it: /* set it as dirty segment in free segmap */ f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap)); /* no free section in conventional zone */ if (new_sec && pinning && !f2fs_valid_pinned_area(sbi, START_BLOCK(sbi, segno))) { ret = -EAGAIN; goto out_unlock; } __set_inuse(sbi, segno); *newseg = segno; out_unlock: spin_unlock(&free_i->segmap_lock); if (ret == -ENOSPC) { f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_NO_SEGMENT); f2fs_bug_on(sbi, 1); } return ret; } static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) { struct curseg_info *curseg = CURSEG_I(sbi, type); struct summary_footer *sum_footer; unsigned short seg_type = curseg->seg_type; /* only happen when get_new_segment() fails */ if (curseg->next_segno == NULL_SEGNO) return; curseg->inited = true; curseg->segno = curseg->next_segno; curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; curseg->next_segno = NULL_SEGNO; sum_footer = &(curseg->sum_blk->footer); memset(sum_footer, 0, sizeof(struct summary_footer)); sanity_check_seg_type(sbi, seg_type); if (IS_DATASEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA); if (IS_NODESEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE); __set_sit_entry_type(sbi, seg_type, curseg->segno, modified); } static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned short seg_type = curseg->seg_type; sanity_check_seg_type(sbi, seg_type); if (f2fs_need_rand_seg(sbi)) return get_random_u32_below(MAIN_SECS(sbi) * SEGS_PER_SEC(sbi)); if (__is_large_section(sbi)) return curseg->segno; /* inmem log may not locate on any segment after mount */ if (!curseg->inited) return 0; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return 0; if (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type)) return 0; if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) return SIT_I(sbi)->last_victim[ALLOC_NEXT]; /* find segments from 0 to reuse freed segments */ if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) return 0; return curseg->segno; } /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. */ static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno = curseg->segno; bool pinning = type == CURSEG_COLD_DATA_PINNED; int ret; if (curseg->inited) write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, segno)); segno = __get_next_segno(sbi, type); ret = get_new_segment(sbi, &segno, new_sec, pinning); if (ret) { if (ret == -ENOSPC) curseg->segno = NULL_SEGNO; return ret; } curseg->next_segno = segno; reset_curseg(sbi, type, 1); curseg->alloc_type = LFS; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) curseg->fragment_remained_chunk = get_random_u32_inclusive(1, sbi->max_fragment_chunk); return 0; } static int __next_free_blkoff(struct f2fs_sb_info *sbi, int segno, block_t start) { struct seg_entry *se = get_seg_entry(sbi, segno); int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); unsigned long *target_map = SIT_I(sbi)->tmp_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; unsigned long *cur_map = (unsigned long *)se->cur_valid_map; int i; for (i = 0; i < entries; i++) target_map[i] = ckpt_map[i] | cur_map[i]; return __find_rev_next_zero_bit(target_map, BLKS_PER_SEG(sbi), start); } static int f2fs_find_next_ssr_block(struct f2fs_sb_info *sbi, struct curseg_info *seg) { return __next_free_blkoff(sbi, seg->segno, seg->next_blkoff + 1); } bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) { return __next_free_blkoff(sbi, segno, 0) < BLKS_PER_SEG(sbi); } /* * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ static int change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int new_segno = curseg->next_segno; struct f2fs_summary_block *sum_node; struct page *sum_page; write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); __set_test_and_inuse(sbi, new_segno); mutex_lock(&dirty_i->seglist_lock); __remove_dirty_segment(sbi, new_segno, PRE); __remove_dirty_segment(sbi, new_segno, DIRTY); mutex_unlock(&dirty_i->seglist_lock); reset_curseg(sbi, type, 1); curseg->alloc_type = SSR; curseg->next_blkoff = __next_free_blkoff(sbi, curseg->segno, 0); sum_page = f2fs_get_sum_page(sbi, new_segno); if (IS_ERR(sum_page)) { /* GC won't be able to use stale summary pages by cp_error */ memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE); return PTR_ERR(sum_page); } sum_node = (struct f2fs_summary_block *)page_address(sum_page); memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); f2fs_put_page(sum_page, 1); return 0; } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, int alloc_mode, unsigned long long age); static int get_atssr_segment(struct f2fs_sb_info *sbi, int type, int target_type, int alloc_mode, unsigned long long age) { struct curseg_info *curseg = CURSEG_I(sbi, type); int ret = 0; curseg->seg_type = target_type; if (get_ssr_segment(sbi, type, alloc_mode, age)) { struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); curseg->seg_type = se->type; ret = change_curseg(sbi, type); } else { /* allocate cold segment by default */ curseg->seg_type = CURSEG_COLD_DATA; ret = new_curseg(sbi, type, true); } stat_inc_seg_type(sbi, curseg); return ret; } static int __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC); int ret = 0; if (!sbi->am.atgc_enabled) return 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&SIT_I(sbi)->sentry_lock); ret = get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, CURSEG_COLD_DATA, SSR, 0); up_write(&SIT_I(sbi)->sentry_lock); mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return ret; } int f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi) { return __f2fs_init_atgc_curseg(sbi); } static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); if (!curseg->inited) goto out; if (get_valid_blocks(sbi, curseg->segno, false)) { write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); } else { mutex_lock(&DIRTY_I(sbi)->seglist_lock); __set_test_and_free(sbi, curseg->segno, true); mutex_unlock(&DIRTY_I(sbi)->seglist_lock); } out: mutex_unlock(&curseg->curseg_mutex); } void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi) { __f2fs_save_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); if (sbi->am.atgc_enabled) __f2fs_save_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); } static void __f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); if (!curseg->inited) goto out; if (get_valid_blocks(sbi, curseg->segno, false)) goto out; mutex_lock(&DIRTY_I(sbi)->seglist_lock); __set_test_and_inuse(sbi, curseg->segno); mutex_unlock(&DIRTY_I(sbi)->seglist_lock); out: mutex_unlock(&curseg->curseg_mutex); } void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi) { __f2fs_restore_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); if (sbi->am.atgc_enabled) __f2fs_restore_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, int alloc_mode, unsigned long long age) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned segno = NULL_SEGNO; unsigned short seg_type = curseg->seg_type; int i, cnt; bool reversed = false; sanity_check_seg_type(sbi, seg_type); /* f2fs_need_SSR() already forces to do this */ if (!f2fs_get_victim(sbi, &segno, BG_GC, seg_type, alloc_mode, age)) { curseg->next_segno = segno; return 1; } /* For node segments, let's do SSR more intensively */ if (IS_NODESEG(seg_type)) { if (seg_type >= CURSEG_WARM_NODE) { reversed = true; i = CURSEG_COLD_NODE; } else { i = CURSEG_HOT_NODE; } cnt = NR_CURSEG_NODE_TYPE; } else { if (seg_type >= CURSEG_WARM_DATA) { reversed = true; i = CURSEG_COLD_DATA; } else { i = CURSEG_HOT_DATA; } cnt = NR_CURSEG_DATA_TYPE; } for (; cnt-- > 0; reversed ? i-- : i++) { if (i == seg_type) continue; if (!f2fs_get_victim(sbi, &segno, BG_GC, i, alloc_mode, age)) { curseg->next_segno = segno; return 1; } } /* find valid_blocks=0 in dirty list */ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { segno = get_free_segment(sbi); if (segno != NULL_SEGNO) { curseg->next_segno = segno; return 1; } } return 0; } static bool need_new_seg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && curseg->seg_type == CURSEG_WARM_NODE) return true; if (curseg->alloc_type == LFS && is_next_segment_free(sbi, curseg, type) && likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0)) return true; return false; } int f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno; int ret = 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&SIT_I(sbi)->sentry_lock); segno = CURSEG_I(sbi, type)->segno; if (segno < start || segno > end) goto unlock; if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) ret = change_curseg(sbi, type); else ret = new_curseg(sbi, type, true); stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, segno); unlock: up_write(&SIT_I(sbi)->sentry_lock); if (segno != curseg->segno) f2fs_notice(sbi, "For resize: curseg of type %d: %u ==> %u", type, segno, curseg->segno); mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return ret; } static int __allocate_new_segment(struct f2fs_sb_info *sbi, int type, bool new_sec, bool force) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; int err = 0; if (type == CURSEG_COLD_DATA_PINNED && !curseg->inited) goto allocate; if (!force && curseg->inited && !curseg->next_blkoff && !get_valid_blocks(sbi, curseg->segno, new_sec) && !get_ckpt_valid_blocks(sbi, curseg->segno, new_sec)) return 0; allocate: old_segno = curseg->segno; err = new_curseg(sbi, type, true); if (err) return err; stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, old_segno); return 0; } int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) { int ret; f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); ret = __allocate_new_segment(sbi, type, true, force); up_write(&SIT_I(sbi)->sentry_lock); f2fs_up_read(&SM_I(sbi)->curseg_lock); return ret; } int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi) { int err; bool gc_required = true; retry: f2fs_lock_op(sbi); err = f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); f2fs_unlock_op(sbi); if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) { f2fs_down_write(&sbi->gc_lock); err = f2fs_gc_range(sbi, 0, GET_SEGNO(sbi, FDEV(0).end_blk), true, 1); f2fs_up_write(&sbi->gc_lock); gc_required = false; if (!err) goto retry; } return err; } int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) { int i; int err = 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) err += __allocate_new_segment(sbi, i, false, false); up_write(&SIT_I(sbi)->sentry_lock); f2fs_up_read(&SM_I(sbi)->curseg_lock); return err; } bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { __u64 trim_start = cpc->trim_start; bool has_candidate = false; down_write(&SIT_I(sbi)->sentry_lock); for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { if (add_discard_addrs(sbi, cpc, true)) { has_candidate = true; break; } } up_write(&SIT_I(sbi)->sentry_lock); cpc->trim_start = trim_start; return has_candidate; } static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, unsigned int start, unsigned int end) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *prev_dc = NULL, *next_dc = NULL; struct rb_node **insert_p = NULL, *insert_parent = NULL; struct discard_cmd *dc; struct blk_plug plug; int issued; unsigned int trimmed = 0; next: issued = 0; mutex_lock(&dcc->cmd_lock); if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_discard_tree(sbi)); dc = __lookup_discard_cmd_ret(&dcc->root, start, &prev_dc, &next_dc, &insert_p, &insert_parent); if (!dc) dc = next_dc; blk_start_plug(&plug); while (dc && dc->di.lstart <= end) { struct rb_node *node; int err = 0; if (dc->di.len < dpolicy->granularity) goto skip; if (dc->state != D_PREP) { list_move_tail(&dc->list, &dcc->fstrim_list); goto skip; } err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); if (issued >= dpolicy->max_requests) { start = dc->di.lstart + dc->di.len; if (err) __remove_discard_cmd(sbi, dc); blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto next; } skip: node = rb_next(&dc->rb_node); if (err) __remove_discard_cmd(sbi, dc); dc = rb_entry_safe(node, struct discard_cmd, rb_node); if (fatal_signal_pending(current)) break; } blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); return trimmed; } int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; unsigned int start_segno, end_segno; block_t start_block, end_block; struct cp_control cpc; struct discard_policy dpolicy; unsigned long long trimmed = 0; int err = 0; bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi); if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; if (end < MAIN_BLKADDR(sbi)) goto out; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { f2fs_warn(sbi, "Found FS corruption, run fsck to fix."); return -EFSCORRUPTED; } /* start/end segment number in main_area */ start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); if (need_align) { start_segno = rounddown(start_segno, SEGS_PER_SEC(sbi)); end_segno = roundup(end_segno + 1, SEGS_PER_SEC(sbi)) - 1; } cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); cpc.trim_start = start_segno; cpc.trim_end = end_segno; if (sbi->discard_blks == 0) goto out; f2fs_down_write(&sbi->gc_lock); stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); f2fs_up_write(&sbi->gc_lock); if (err) goto out; /* * We filed discard candidates, but actually we don't need to wait for * all of them, since they'll be issued in idle time along with runtime * discard option. User configuration looks like using runtime discard * or periodic fstrim instead of it. */ if (f2fs_realtime_discard_enable(sbi)) goto out; start_block = START_BLOCK(sbi, start_segno); end_block = START_BLOCK(sbi, end_segno + 1); __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); trimmed = __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); trimmed += __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); out: if (!err) range->len = F2FS_BLK_TO_BYTES(trimmed); return err; } int f2fs_rw_hint_to_seg_type(enum rw_hint hint) { switch (hint) { case WRITE_LIFE_SHORT: return CURSEG_HOT_DATA; case WRITE_LIFE_EXTREME: return CURSEG_COLD_DATA; default: return CURSEG_WARM_DATA; } } static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) return CURSEG_HOT_DATA; else return CURSEG_HOT_NODE; } static int __get_segment_type_4(struct f2fs_io_info *fio) { if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; if (S_ISDIR(inode->i_mode)) return CURSEG_HOT_DATA; else return CURSEG_COLD_DATA; } else { if (IS_DNODE(fio->page) && is_cold_node(fio->page)) return CURSEG_WARM_NODE; else return CURSEG_COLD_NODE; } } static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_info ei = {}; if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) { if (!ei.age) return NO_CHECK_TYPE; if (ei.age <= sbi->hot_data_age_threshold) return CURSEG_HOT_DATA; if (ei.age <= sbi->warm_data_age_threshold) return CURSEG_WARM_DATA; return CURSEG_COLD_DATA; } return NO_CHECK_TYPE; } static int __get_segment_type_6(struct f2fs_io_info *fio) { if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; int type; if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return CURSEG_COLD_DATA_PINNED; if (page_private_gcing(fio->page)) { if (fio->sbi->am.atgc_enabled && (fio->io_type == FS_DATA_IO) && (fio->sbi->gc_mode != GC_URGENT_HIGH)) return CURSEG_ALL_DATA_ATGC; else return CURSEG_COLD_DATA; } if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; type = __get_age_segment_type(inode, fio->page->index); if (type != NO_CHECK_TYPE) return type; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || f2fs_is_cow_file(inode)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(inode->i_write_hint); } else { if (IS_DNODE(fio->page)) return is_cold_node(fio->page) ? CURSEG_WARM_NODE : CURSEG_HOT_NODE; return CURSEG_COLD_NODE; } } static int __get_segment_type(struct f2fs_io_info *fio) { int type = 0; switch (F2FS_OPTION(fio->sbi).active_logs) { case 2: type = __get_segment_type_2(fio); break; case 4: type = __get_segment_type_4(fio); break; case 6: type = __get_segment_type_6(fio); break; default: f2fs_bug_on(fio->sbi, true); } if (IS_HOT(type)) fio->temp = HOT; else if (IS_WARM(type)) fio->temp = WARM; else fio->temp = COLD; return type; } static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi, struct curseg_info *seg) { /* To allocate block chunks in different sizes, use random number */ if (--seg->fragment_remained_chunk > 0) return; seg->fragment_remained_chunk = get_random_u32_inclusive(1, sbi->max_fragment_chunk); seg->next_blkoff += get_random_u32_inclusive(1, sbi->max_fragment_hole); } static void reset_curseg_fields(struct curseg_info *curseg) { curseg->inited = false; curseg->segno = NULL_SEGNO; curseg->next_segno = 0; } int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned long long old_mtime; bool from_gc = (type == CURSEG_ALL_DATA_ATGC); struct seg_entry *se = NULL; bool segment_full = false; int ret = 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); if (curseg->segno == NULL_SEGNO) { ret = -ENOSPC; goto out_err; } if (from_gc) { f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO); se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr)); sanity_check_seg_type(sbi, se->type); f2fs_bug_on(sbi, IS_NODESEG(se->type)); } *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); f2fs_bug_on(sbi, curseg->next_blkoff >= BLKS_PER_SEG(sbi)); f2fs_wait_discard_bio(sbi, *new_blkaddr); curseg->sum_blk->entries[curseg->next_blkoff] = *sum; if (curseg->alloc_type == SSR) { curseg->next_blkoff = f2fs_find_next_ssr_block(sbi, curseg); } else { curseg->next_blkoff++; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) f2fs_randomize_chunk(sbi, curseg); } if (curseg->next_blkoff >= f2fs_usable_blks_in_seg(sbi, curseg->segno)) segment_full = true; stat_inc_block_count(sbi, curseg); if (from_gc) { old_mtime = get_segment_mtime(sbi, old_blkaddr); } else { update_segment_mtime(sbi, old_blkaddr, 0); old_mtime = 0; } update_segment_mtime(sbi, *new_blkaddr, old_mtime); /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. */ update_sit_entry(sbi, *new_blkaddr, 1); update_sit_entry(sbi, old_blkaddr, -1); /* * If the current segment is full, flush it out and replace it with a * new segment. */ if (segment_full) { if (type == CURSEG_COLD_DATA_PINNED && !((curseg->segno + 1) % sbi->segs_per_sec)) { reset_curseg_fields(curseg); goto skip_new_segment; } if (from_gc) { ret = get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); } else { if (need_new_seg(sbi, type)) ret = new_curseg(sbi, type, false); else ret = change_curseg(sbi, type); stat_inc_seg_type(sbi, curseg); } if (ret) goto out_err; } skip_new_segment: /* * segment dirty status should be updated after segment allocation, * so we just need to update status only one time after previous * segment being closed. */ locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); if (IS_DATASEG(curseg->seg_type)) atomic64_inc(&sbi->allocated_data_blocks); up_write(&sit_i->sentry_lock); if (page && IS_NODESEG(curseg->seg_type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); f2fs_inode_chksum_set(sbi, page); } if (fio) { struct f2fs_bio_info *io; INIT_LIST_HEAD(&fio->list); fio->in_list = 1; io = sbi->write_io[fio->type] + fio->temp; spin_lock(&io->io_lock); list_add_tail(&fio->list, &io->io_list); spin_unlock(&io->io_lock); } mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return 0; out_err: *new_blkaddr = NULL_ADDR; up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return ret; } void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, block_t blkaddr, unsigned int blkcnt) { if (!f2fs_is_multi_device(sbi)) return; while (1) { unsigned int devidx = f2fs_target_device_index(sbi, blkaddr); unsigned int blks = FDEV(devidx).end_blk - blkaddr + 1; /* update device state for fsync */ f2fs_set_dirty_device(sbi, ino, devidx, FLUSH_INO); /* update device state for checkpoint */ if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { spin_lock(&sbi->dev_lock); f2fs_set_bit(devidx, (char *)&sbi->dirty_device); spin_unlock(&sbi->dev_lock); } if (blkcnt <= blks) break; blkcnt -= blks; blkaddr += blks; } } static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio); bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA); if (keep_order) f2fs_down_read(&fio->sbi->io_order_lock); if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio)) { if (fscrypt_inode_uses_fs_layer_crypto(fio->page->mapping->host)) fscrypt_finalize_bounce_page(&fio->encrypted_page); if (PageWriteback(fio->page)) end_page_writeback(fio->page); if (f2fs_in_warm_node_list(fio->sbi, fio->page)) f2fs_del_fsync_node_entry(fio->sbi, fio->page); goto out; } if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr); /* writeout dirty page into bdev */ f2fs_submit_page_write(fio); f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); out: if (keep_order) f2fs_up_read(&fio->sbi->io_order_lock); } void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, enum iostat_type io_type) { struct f2fs_io_info fio = { .sbi = sbi, .type = META, .temp = HOT, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = page->index, .new_blkaddr = page->index, .page = page, .encrypted_page = NULL, .in_list = 0, }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) fio.op_flags &= ~REQ_META; set_page_writeback(page); f2fs_submit_page_write(&fio); stat_inc_meta_count(sbi, page->index); f2fs_update_iostat(sbi, NULL, io_type, F2FS_BLKSIZE); } void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) { struct f2fs_summary sum; set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); f2fs_update_iostat(fio->sbi, NULL, fio->io_type, F2FS_BLKSIZE); } void f2fs_outplace_write_data(struct dnode_of_data *dn, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; struct f2fs_summary sum; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); if (fio->io_type == FS_DATA_IO || fio->io_type == FS_CP_DATA_IO) f2fs_update_age_extent_cache(dn); set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); f2fs_update_iostat(sbi, dn->inode, fio->io_type, F2FS_BLKSIZE); } int f2fs_inplace_write_data(struct f2fs_io_info *fio) { int err; struct f2fs_sb_info *sbi = fio->sbi; unsigned int segno; fio->new_blkaddr = fio->old_blkaddr; /* i/o temperature is needed for passing down write hints */ __get_segment_type(fio); segno = GET_SEGNO(sbi, fio->new_blkaddr); if (!IS_DATASEG(get_seg_entry(sbi, segno)->type)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.", __func__, segno); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); goto drop_bio; } if (f2fs_cp_error(sbi)) { err = -EIO; goto drop_bio; } if (fio->post_read) f2fs_truncate_meta_inode_pages(sbi, fio->new_blkaddr, 1); stat_inc_inplace_blocks(fio->sbi); if (fio->bio && !IS_F2FS_IPU_NOCACHE(sbi)) err = f2fs_merge_page_bio(fio); else err = f2fs_submit_page_bio(fio); if (!err) { f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); f2fs_update_iostat(fio->sbi, fio->page->mapping->host, fio->io_type, F2FS_BLKSIZE); } return err; drop_bio: if (fio->bio && *(fio->bio)) { struct bio *bio = *(fio->bio); bio->bi_status = BLK_STS_IOERR; bio_endio(bio); *(fio->bio) = NULL; } return err; } static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, unsigned int segno) { int i; for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { if (CURSEG_I(sbi, i)->segno == segno) break; } return i; } void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr, bool from_gc) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; unsigned int segno, old_cursegno; struct seg_entry *se; int type; unsigned short old_blkoff; unsigned char old_alloc_type; segno = GET_SEGNO(sbi, new_blkaddr); se = get_seg_entry(sbi, segno); type = se->type; f2fs_down_write(&SM_I(sbi)->curseg_lock); if (!recover_curseg) { /* for recovery flow */ if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { if (old_blkaddr == NULL_ADDR) type = CURSEG_COLD_DATA; else type = CURSEG_WARM_DATA; } } else { if (IS_CURSEG(sbi, segno)) { /* se->type is volatile as SSR allocation */ type = __f2fs_get_curseg(sbi, segno); f2fs_bug_on(sbi, type == NO_CHECK_TYPE); } else { type = CURSEG_WARM_DATA; } } f2fs_bug_on(sbi, !IS_DATASEG(type)); curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); old_cursegno = curseg->segno; old_blkoff = curseg->next_blkoff; old_alloc_type = curseg->alloc_type; /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; if (change_curseg(sbi, type)) goto out_unlock; } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); curseg->sum_blk->entries[curseg->next_blkoff] = *sum; if (!recover_curseg || recover_newaddr) { if (!from_gc) update_segment_mtime(sbi, new_blkaddr, 0); update_sit_entry(sbi, new_blkaddr, 1); } if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { f2fs_invalidate_internal_cache(sbi, old_blkaddr); if (!from_gc) update_segment_mtime(sbi, old_blkaddr, 0); update_sit_entry(sbi, old_blkaddr, -1); } locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr)); locate_dirty_segment(sbi, old_cursegno); if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; if (change_curseg(sbi, type)) goto out_unlock; } curseg->next_blkoff = old_blkoff; curseg->alloc_type = old_alloc_type; } out_unlock: up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); f2fs_up_write(&SM_I(sbi)->curseg_lock); } void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, unsigned char version, bool recover_curseg, bool recover_newaddr) { struct f2fs_summary sum; set_summary(&sum, dn->nid, dn->ofs_in_node, version); f2fs_do_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg, recover_newaddr, false); f2fs_update_data_blkaddr(dn, new_addr); } void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered, bool locked) { if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); /* submit cached LFS IO */ f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type); /* submit cached IPU IO */ f2fs_submit_merged_ipu_write(sbi, NULL, page); if (ordered) { wait_on_page_writeback(page); f2fs_bug_on(sbi, locked && PageWriteback(page)); } else { wait_for_stable_page(page); } } } void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *cpage; if (!f2fs_post_read_required(inode)) return; if (!__is_valid_data_blkaddr(blkaddr)) return; cpage = find_lock_page(META_MAPPING(sbi), blkaddr); if (cpage) { f2fs_wait_on_page_writeback(cpage, DATA, true, true); f2fs_put_page(cpage, 1); } } void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); block_t i; if (!f2fs_post_read_required(inode)) return; for (i = 0; i < len; i++) f2fs_wait_on_block_writeback(inode, blkaddr + i); f2fs_truncate_meta_inode_pages(sbi, blkaddr, len); } static int read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *seg_i; unsigned char *kaddr; struct page *page; block_t start; int i, j, offset; start = start_sum_block(sbi); page = f2fs_get_meta_page(sbi, start++); if (IS_ERR(page)) return PTR_ERR(page); kaddr = (unsigned char *)page_address(page); /* Step 1: restore nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE); /* Step 2: restore sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE); offset = 2 * SUM_JOURNAL_SIZE; /* Step 3: restore summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { unsigned short blk_off; unsigned int segno; seg_i = CURSEG_I(sbi, i); segno = le32_to_cpu(ckpt->cur_data_segno[i]); blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]); seg_i->next_segno = segno; reset_curseg(sbi, i, 0); seg_i->alloc_type = ckpt->alloc_type[i]; seg_i->next_blkoff = blk_off; if (seg_i->alloc_type == SSR) blk_off = BLKS_PER_SEG(sbi); for (j = 0; j < blk_off; j++) { struct f2fs_summary *s; s = (struct f2fs_summary *)(kaddr + offset); seg_i->sum_blk->entries[j] = *s; offset += SUMMARY_SIZE; if (offset + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; f2fs_put_page(page, 1); page = NULL; page = f2fs_get_meta_page(sbi, start++); if (IS_ERR(page)) return PTR_ERR(page); kaddr = (unsigned char *)page_address(page); offset = 0; } } f2fs_put_page(page, 1); return 0; } static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_summary_block *sum; struct curseg_info *curseg; struct page *new; unsigned short blk_off; unsigned int segno = 0; block_t blk_addr = 0; int err = 0; /* get segment number and block addr */ if (IS_DATASEG(type)) { segno = le32_to_cpu(ckpt->cur_data_segno[type]); blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - CURSEG_HOT_DATA]); if (__exist_node_summaries(sbi)) blk_addr = sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type); else blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); } else { segno = le32_to_cpu(ckpt->cur_node_segno[type - CURSEG_HOT_NODE]); blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type - CURSEG_HOT_NODE]); if (__exist_node_summaries(sbi)) blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE, type - CURSEG_HOT_NODE); else blk_addr = GET_SUM_BLOCK(sbi, segno); } new = f2fs_get_meta_page(sbi, blk_addr); if (IS_ERR(new)) return PTR_ERR(new); sum = (struct f2fs_summary_block *)page_address(new); if (IS_NODESEG(type)) { if (__exist_node_summaries(sbi)) { struct f2fs_summary *ns = &sum->entries[0]; int i; for (i = 0; i < BLKS_PER_SEG(sbi); i++, ns++) { ns->version = 0; ns->ofs_in_node = 0; } } else { err = f2fs_restore_node_summary(sbi, segno, sum); if (err) goto out; } } /* set uncompleted segment to curseg */ curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); /* update journal info */ down_write(&curseg->journal_rwsem); memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE); up_write(&curseg->journal_rwsem); memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE); memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE); curseg->next_segno = segno; reset_curseg(sbi, type, 0); curseg->alloc_type = ckpt->alloc_type[type]; curseg->next_blkoff = blk_off; mutex_unlock(&curseg->curseg_mutex); out: f2fs_put_page(new, 1); return err; } static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal; struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal; int type = CURSEG_HOT_DATA; int err; if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) { int npages = f2fs_npages_for_summary_flush(sbi, true); if (npages >= 2) f2fs_ra_meta_pages(sbi, start_sum_block(sbi), npages, META_CP, true); /* restore for compacted data summary */ err = read_compacted_summaries(sbi); if (err) return err; type = CURSEG_HOT_NODE; } if (__exist_node_summaries(sbi)) f2fs_ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type), NR_CURSEG_PERSIST_TYPE - type, META_CP, true); for (; type <= CURSEG_COLD_NODE; type++) { err = read_normal_summaries(sbi, type); if (err) return err; } /* sanity check for summary blocks */ if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) { f2fs_err(sbi, "invalid journal entries nats %u sits %u", nats_in_cursum(nat_j), sits_in_cursum(sit_j)); return -EINVAL; } return 0; } static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *page; unsigned char *kaddr; struct f2fs_summary *summary; struct curseg_info *seg_i; int written_size = 0; int i, j; page = f2fs_grab_meta_page(sbi, blkaddr++); kaddr = (unsigned char *)page_address(page); memset(kaddr, 0, PAGE_SIZE); /* Step 1: write nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 2: write sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 3: write summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { seg_i = CURSEG_I(sbi, i); for (j = 0; j < f2fs_curseg_valid_blocks(sbi, i); j++) { if (!page) { page = f2fs_grab_meta_page(sbi, blkaddr++); kaddr = (unsigned char *)page_address(page); memset(kaddr, 0, PAGE_SIZE); written_size = 0; } summary = (struct f2fs_summary *)(kaddr + written_size); *summary = seg_i->sum_blk->entries[j]; written_size += SUMMARY_SIZE; if (written_size + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; set_page_dirty(page); f2fs_put_page(page, 1); page = NULL; } } if (page) { set_page_dirty(page); f2fs_put_page(page, 1); } } static void write_normal_summaries(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { int i, end; if (IS_DATASEG(type)) end = type + NR_CURSEG_DATA_TYPE; else end = type + NR_CURSEG_NODE_TYPE; for (i = type; i < end; i++) write_current_sum_page(sbi, i, blkaddr + (i - type)); } void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) write_compacted_summaries(sbi, start_blk); else write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); } void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); } int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc) { int i; if (type == NAT_JOURNAL) { for (i = 0; i < nats_in_cursum(journal); i++) { if (le32_to_cpu(nid_in_journal(journal, i)) == val) return i; } if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL)) return update_nats_in_cursum(journal, 1); } else if (type == SIT_JOURNAL) { for (i = 0; i < sits_in_cursum(journal); i++) if (le32_to_cpu(segno_in_journal(journal, i)) == val) return i; if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL)) return update_sits_in_cursum(journal, 1); } return -1; } static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, unsigned int segno) { return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno)); } static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, unsigned int start) { struct sit_info *sit_i = SIT_I(sbi); struct page *page; pgoff_t src_off, dst_off; src_off = current_sit_addr(sbi, start); dst_off = next_sit_addr(sbi, src_off); page = f2fs_grab_meta_page(sbi, dst_off); seg_info_to_sit_page(sbi, page, start); set_page_dirty(page); set_to_next_sit(sit_i, start); return page; } static struct sit_entry_set *grab_sit_entry_set(void) { struct sit_entry_set *ses = f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS, true, NULL); ses->entry_cnt = 0; INIT_LIST_HEAD(&ses->set_list); return ses; } static void release_sit_entry_set(struct sit_entry_set *ses) { list_del(&ses->set_list); kmem_cache_free(sit_entry_set_slab, ses); } static void adjust_sit_entry_set(struct sit_entry_set *ses, struct list_head *head) { struct sit_entry_set *next = ses; if (list_is_last(&ses->set_list, head)) return; list_for_each_entry_continue(next, head, set_list) if (ses->entry_cnt <= next->entry_cnt) { list_move_tail(&ses->set_list, &next->set_list); return; } list_move_tail(&ses->set_list, head); } static void add_sit_entry(unsigned int segno, struct list_head *head) { struct sit_entry_set *ses; unsigned int start_segno = START_SEGNO(segno); list_for_each_entry(ses, head, set_list) { if (ses->start_segno == start_segno) { ses->entry_cnt++; adjust_sit_entry_set(ses, head); return; } } ses = grab_sit_entry_set(); ses->start_segno = start_segno; ses->entry_cnt++; list_add(&ses->set_list, head); } static void add_sits_in_set(struct f2fs_sb_info *sbi) { struct f2fs_sm_info *sm_info = SM_I(sbi); struct list_head *set_list = &sm_info->sit_entry_set; unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap; unsigned int segno; for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi)) add_sit_entry(segno, set_list); } static void remove_sits_in_journal(struct f2fs_sb_info *sbi) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_journal *journal = curseg->journal; int i; down_write(&curseg->journal_rwsem); for (i = 0; i < sits_in_cursum(journal); i++) { unsigned int segno; bool dirtied; segno = le32_to_cpu(segno_in_journal(journal, i)); dirtied = __mark_sit_entry_dirty(sbi, segno); if (!dirtied) add_sit_entry(segno, &SM_I(sbi)->sit_entry_set); } update_sits_in_cursum(journal, -i); up_write(&curseg->journal_rwsem); } /* * CP calls this function, which flushes SIT entries including sit_journal, * and moves prefree segs to free segs. */ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct sit_info *sit_i = SIT_I(sbi); unsigned long *bitmap = sit_i->dirty_sentries_bitmap; struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_journal *journal = curseg->journal; struct sit_entry_set *ses, *tmp; struct list_head *head = &SM_I(sbi)->sit_entry_set; bool to_journal = !is_sbi_flag_set(sbi, SBI_IS_RESIZEFS); struct seg_entry *se; down_write(&sit_i->sentry_lock); if (!sit_i->dirty_sentries) goto out; /* * add and account sit entries of dirty bitmap in sit entry * set temporarily */ add_sits_in_set(sbi); /* * if there are no enough space in journal to store dirty sit * entries, remove all entries from journal and add and account * them in sit entry set. */ if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL) || !to_journal) remove_sits_in_journal(sbi); /* * there are two steps to flush sit entries: * #1, flush sit entries to journal in current cold data summary block. * #2, flush sit entries to sit page. */ list_for_each_entry_safe(ses, tmp, head, set_list) { struct page *page = NULL; struct f2fs_sit_block *raw_sit = NULL; unsigned int start_segno = ses->start_segno; unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK, (unsigned long)MAIN_SEGS(sbi)); unsigned int segno = start_segno; if (to_journal && !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL)) to_journal = false; if (to_journal) { down_write(&curseg->journal_rwsem); } else { page = get_next_sit_page(sbi, start_segno); raw_sit = page_address(page); } /* flush dirty sit entries in region of current sit set */ for_each_set_bit_from(segno, bitmap, end) { int offset, sit_offset; se = get_seg_entry(sbi, segno); #ifdef CONFIG_F2FS_CHECK_FS if (memcmp(se->cur_valid_map, se->cur_valid_map_mir, SIT_VBLOCK_MAP_SIZE)) f2fs_bug_on(sbi, 1); #endif /* add discard candidates */ if (!(cpc->reason & CP_DISCARD)) { cpc->trim_start = segno; add_discard_addrs(sbi, cpc, false); } if (to_journal) { offset = f2fs_lookup_journal_in_cursum(journal, SIT_JOURNAL, segno, 1); f2fs_bug_on(sbi, offset < 0); segno_in_journal(journal, offset) = cpu_to_le32(segno); seg_info_to_raw_sit(se, &sit_in_journal(journal, offset)); check_block_count(sbi, segno, &sit_in_journal(journal, offset)); } else { sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]); check_block_count(sbi, segno, &raw_sit->entries[sit_offset]); } __clear_bit(segno, bitmap); sit_i->dirty_sentries--; ses->entry_cnt--; } if (to_journal) up_write(&curseg->journal_rwsem); else f2fs_put_page(page, 1); f2fs_bug_on(sbi, ses->entry_cnt); release_sit_entry_set(ses); } f2fs_bug_on(sbi, !list_empty(head)); f2fs_bug_on(sbi, sit_i->dirty_sentries); out: if (cpc->reason & CP_DISCARD) { __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) add_discard_addrs(sbi, cpc, false); cpc->trim_start = trim_start; } up_write(&sit_i->sentry_lock); set_prefree_as_free_segments(sbi); } static int build_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; char *src_bitmap, *bitmap; unsigned int bitmap_size, main_bitmap_size, sit_bitmap_size; unsigned int discard_map = f2fs_block_unit_discard(sbi) ? 1 : 0; /* allocate memory for SIT information */ sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL); if (!sit_i) return -ENOMEM; SM_I(sbi)->sit_info = sit_i; sit_i->sentries = f2fs_kvzalloc(sbi, array_size(sizeof(struct seg_entry), MAIN_SEGS(sbi)), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; main_bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(sbi, main_bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; #ifdef CONFIG_F2FS_CHECK_FS bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (3 + discard_map); #else bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (2 + discard_map); #endif sit_i->bitmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!sit_i->bitmap) return -ENOMEM; bitmap = sit_i->bitmap; for (start = 0; start < MAIN_SEGS(sbi); start++) { sit_i->sentries[start].cur_valid_map = bitmap; bitmap += SIT_VBLOCK_MAP_SIZE; sit_i->sentries[start].ckpt_valid_map = bitmap; bitmap += SIT_VBLOCK_MAP_SIZE; #ifdef CONFIG_F2FS_CHECK_FS sit_i->sentries[start].cur_valid_map_mir = bitmap; bitmap += SIT_VBLOCK_MAP_SIZE; #endif if (discard_map) { sit_i->sentries[start].discard_map = bitmap; bitmap += SIT_VBLOCK_MAP_SIZE; } } sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->tmp_map) return -ENOMEM; if (__is_large_section(sbi)) { sit_i->sec_entries = f2fs_kvzalloc(sbi, array_size(sizeof(struct sec_entry), MAIN_SECS(sbi)), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; } /* get information related with SIT */ sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1; /* setup SIT bitmap from ckeckpoint pack */ sit_bitmap_size = __bitmap_size(sbi, SIT_BITMAP); src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); sit_i->sit_bitmap = kmemdup(src_bitmap, sit_bitmap_size, GFP_KERNEL); if (!sit_i->sit_bitmap) return -ENOMEM; #ifdef CONFIG_F2FS_CHECK_FS sit_i->sit_bitmap_mir = kmemdup(src_bitmap, sit_bitmap_size, GFP_KERNEL); if (!sit_i->sit_bitmap_mir) return -ENOMEM; sit_i->invalid_segmap = f2fs_kvzalloc(sbi, main_bitmap_size, GFP_KERNEL); if (!sit_i->invalid_segmap) return -ENOMEM; #endif sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = SEGS_TO_BLKS(sbi, sit_segs); sit_i->written_valid_blocks = 0; sit_i->bitmap_size = sit_bitmap_size; sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); sit_i->mounted_time = ktime_get_boottime_seconds(); init_rwsem(&sit_i->sentry_lock); return 0; } static int build_free_segmap(struct f2fs_sb_info *sbi) { struct free_segmap_info *free_i; unsigned int bitmap_size, sec_bitmap_size; /* allocate memory for free segmap information */ free_i = f2fs_kzalloc(sbi, sizeof(struct free_segmap_info), GFP_KERNEL); if (!free_i) return -ENOMEM; SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); free_i->free_segmap = f2fs_kvmalloc(sbi, bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); free_i->free_secmap = f2fs_kvmalloc(sbi, sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; /* set all segments as dirty temporarily */ memset(free_i->free_segmap, 0xff, bitmap_size); memset(free_i->free_secmap, 0xff, sec_bitmap_size); /* init free segmap information */ free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi)); free_i->free_segments = 0; free_i->free_sections = 0; spin_lock_init(&free_i->segmap_lock); return 0; } static int build_curseg(struct f2fs_sb_info *sbi) { struct curseg_info *array; int i; array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, sizeof(*array)), GFP_KERNEL); if (!array) return -ENOMEM; SM_I(sbi)->curseg_array = array; for (i = 0; i < NO_CHECK_TYPE; i++) { mutex_init(&array[i].curseg_mutex); array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) return -ENOMEM; init_rwsem(&array[i].journal_rwsem); array[i].journal = f2fs_kzalloc(sbi, sizeof(struct f2fs_journal), GFP_KERNEL); if (!array[i].journal) return -ENOMEM; if (i < NR_PERSISTENT_LOG) array[i].seg_type = CURSEG_HOT_DATA + i; else if (i == CURSEG_COLD_DATA_PINNED) array[i].seg_type = CURSEG_COLD_DATA; else if (i == CURSEG_ALL_DATA_ATGC) array[i].seg_type = CURSEG_COLD_DATA; reset_curseg_fields(&array[i]); } return restore_curseg_summaries(sbi); } static int build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_journal *journal = curseg->journal; struct seg_entry *se; struct f2fs_sit_entry sit; int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; int err = 0; block_t sit_valid_blocks[2] = {0, 0}; do { readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS, META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; for (; start < end && start < MAIN_SEGS(sbi); start++) { struct f2fs_sit_block *sit_blk; struct page *page; se = &sit_i->sentries[start]; page = get_current_sit_page(sbi, start); if (IS_ERR(page)) return PTR_ERR(page); sit_blk = (struct f2fs_sit_block *)page_address(page); sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; f2fs_put_page(page, 1); err = check_block_count(sbi, start, &sit); if (err) return err; seg_info_from_raw_sit(se, &sit); if (se->type >= NR_PERSISTENT_LOG) { f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); return -EFSCORRUPTED; } sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (!f2fs_block_unit_discard(sbi)) goto init_discard_map_done; /* build discard map only one time */ if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); goto init_discard_map_done; } memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); sbi->discard_blks += BLKS_PER_SEG(sbi) - se->valid_blocks; init_discard_map_done: if (__is_large_section(sbi)) get_sec_entry(sbi, start)->valid_blocks += se->valid_blocks; } start_blk += readed; } while (start_blk < sit_blk_cnt); down_read(&curseg->journal_rwsem); for (i = 0; i < sits_in_cursum(journal); i++) { unsigned int old_valid_blocks; start = le32_to_cpu(segno_in_journal(journal, i)); if (start >= MAIN_SEGS(sbi)) { f2fs_err(sbi, "Wrong journal entry on segno %u", start); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_CORRUPTED_JOURNAL); break; } se = &sit_i->sentries[start]; sit = sit_in_journal(journal, i); old_valid_blocks = se->valid_blocks; sit_valid_blocks[SE_PAGETYPE(se)] -= old_valid_blocks; err = check_block_count(sbi, start, &sit); if (err) break; seg_info_from_raw_sit(se, &sit); if (se->type >= NR_PERSISTENT_LOG) { f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); break; } sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); } else { memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); sbi->discard_blks += old_valid_blocks; sbi->discard_blks -= se->valid_blocks; } } if (__is_large_section(sbi)) { get_sec_entry(sbi, start)->valid_blocks += se->valid_blocks; get_sec_entry(sbi, start)->valid_blocks -= old_valid_blocks; } } up_read(&curseg->journal_rwsem); if (err) return err; if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", sit_valid_blocks[NODE], valid_node_count(sbi)); f2fs_handle_error(sbi, ERROR_INCONSISTENT_NODE_COUNT); return -EFSCORRUPTED; } if (sit_valid_blocks[DATA] + sit_valid_blocks[NODE] > valid_user_blocks(sbi)) { f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", sit_valid_blocks[DATA], sit_valid_blocks[NODE], valid_user_blocks(sbi)); f2fs_handle_error(sbi, ERROR_INCONSISTENT_BLOCK_COUNT); return -EFSCORRUPTED; } return 0; } static void init_free_segmap(struct f2fs_sb_info *sbi) { unsigned int start; int type; struct seg_entry *sentry; for (start = 0; start < MAIN_SEGS(sbi); start++) { if (f2fs_usable_blks_in_seg(sbi, start) == 0) continue; sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); else SIT_I(sbi)->written_valid_blocks += sentry->valid_blocks; } /* set use the current segments */ for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) { struct curseg_info *curseg_t = CURSEG_I(sbi, type); __set_test_and_inuse(sbi, curseg_t->segno); } } static void init_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno = 0, offset = 0, secno; block_t valid_blocks, usable_blks_in_seg; while (1) { /* find dirty segment based on free segmap */ segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset); if (segno >= MAIN_SEGS(sbi)) break; offset = segno + 1; valid_blocks = get_valid_blocks(sbi, segno, false); usable_blks_in_seg = f2fs_usab |