| 4 1 3 15 1 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 | // SPDX-License-Identifier: GPL-2.0-only /* * TTL modification target for IP tables * (C) 2000,2005 by Harald Welte <laforge@netfilter.org> * * Hop Limit modification target for ip6tables * Maciej Soltysiak <solt@dns.toxicfilms.tv> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/checksum.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ipt_TTL.h> #include <linux/netfilter_ipv6/ip6t_HL.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); MODULE_DESCRIPTION("Xtables: Hoplimit/TTL Limit field modification target"); MODULE_LICENSE("GPL"); static unsigned int ttl_tg(struct sk_buff *skb, const struct xt_action_param *par) { struct iphdr *iph; const struct ipt_TTL_info *info = par->targinfo; int new_ttl; if (skb_ensure_writable(skb, sizeof(*iph))) return NF_DROP; iph = ip_hdr(skb); switch (info->mode) { case IPT_TTL_SET: new_ttl = info->ttl; break; case IPT_TTL_INC: new_ttl = iph->ttl + info->ttl; if (new_ttl > 255) new_ttl = 255; break; case IPT_TTL_DEC: new_ttl = iph->ttl - info->ttl; if (new_ttl < 0) new_ttl = 0; break; default: new_ttl = iph->ttl; break; } if (new_ttl != iph->ttl) { csum_replace2(&iph->check, htons(iph->ttl << 8), htons(new_ttl << 8)); iph->ttl = new_ttl; } return XT_CONTINUE; } static unsigned int hl_tg6(struct sk_buff *skb, const struct xt_action_param *par) { struct ipv6hdr *ip6h; const struct ip6t_HL_info *info = par->targinfo; int new_hl; if (skb_ensure_writable(skb, sizeof(*ip6h))) return NF_DROP; ip6h = ipv6_hdr(skb); switch (info->mode) { case IP6T_HL_SET: new_hl = info->hop_limit; break; case IP6T_HL_INC: new_hl = ip6h->hop_limit + info->hop_limit; if (new_hl > 255) new_hl = 255; break; case IP6T_HL_DEC: new_hl = ip6h->hop_limit - info->hop_limit; if (new_hl < 0) new_hl = 0; break; default: new_hl = ip6h->hop_limit; break; } ip6h->hop_limit = new_hl; return XT_CONTINUE; } static int ttl_tg_check(const struct xt_tgchk_param *par) { const struct ipt_TTL_info *info = par->targinfo; if (info->mode > IPT_TTL_MAXMODE) return -EINVAL; if (info->mode != IPT_TTL_SET && info->ttl == 0) return -EINVAL; return 0; } static int hl_tg6_check(const struct xt_tgchk_param *par) { const struct ip6t_HL_info *info = par->targinfo; if (info->mode > IP6T_HL_MAXMODE) return -EINVAL; if (info->mode != IP6T_HL_SET && info->hop_limit == 0) return -EINVAL; return 0; } static struct xt_target hl_tg_reg[] __read_mostly = { { .name = "TTL", .revision = 0, .family = NFPROTO_IPV4, .target = ttl_tg, .targetsize = sizeof(struct ipt_TTL_info), .table = "mangle", .checkentry = ttl_tg_check, .me = THIS_MODULE, }, { .name = "HL", .revision = 0, .family = NFPROTO_IPV6, .target = hl_tg6, .targetsize = sizeof(struct ip6t_HL_info), .table = "mangle", .checkentry = hl_tg6_check, .me = THIS_MODULE, }, }; static int __init hl_tg_init(void) { return xt_register_targets(hl_tg_reg, ARRAY_SIZE(hl_tg_reg)); } static void __exit hl_tg_exit(void) { xt_unregister_targets(hl_tg_reg, ARRAY_SIZE(hl_tg_reg)); } module_init(hl_tg_init); module_exit(hl_tg_exit); MODULE_ALIAS("ipt_TTL"); MODULE_ALIAS("ip6t_HL"); |
| 23 23 2 1 23 3 23 1 1 23 23 18 18 47 29 29 25 29 9 8 9 9 1 5 28 3 26 1 6 1 7 1 42 42 42 12 1 14 21 8 5 5 5 47 29 28 12 1 15 25 9 8 8 4 6 15 15 3 7 2 1 1 1 15 2 1 1 2 1 1 11 1 1 47 47 1 97 131 36 4 16 20 4 16 4 2 6 4 51 1 50 13 13 37 50 50 49 45 4 3 1 48 47 1 2 1 49 8 8 3 2 3 2 1 4 4 1 2 5 3 62 6 9 2 44 9 1 8 5 7 9 51 50 41 39 39 39 2 2 7 1 42 7 2 5 5 5 3 2 3 3 3 3 3 3 3 3 3 46 2 1 45 3 40 1 2 3 41 41 44 52 52 6 44 49 67 1 66 3 64 67 57 9 7 3 3 3 25 17 1 1 1 1 14 8 3 2 1 3 2 1 1 1 1 2 9 8 2 8 9 3 421 13 13 4 76 2 62 64 64 64 64 60 4 34 26 8 30 1 19 10 19 10 22 7 28 1 28 1 28 1 28 1 22 7 24 5 28 1 31 31 29 2 5 25 15 6 2 1 1 1 3 1 2 172 338 4 2 3 2 2 2 1 1 3 2 2 334 4 417 37 37 37 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 tunneling device * Linux INET6 implementation * * Authors: * Ville Nuorvala <vnuorval@tcs.hut.fi> * Yasuyuki Kozakai <kozakai@linux-ipv6.org> * * Based on: * linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c * * RFC 2473 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/etherdevice.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/ip6_tunnel.h> #include <net/xfrm.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netdev_lock.h> #include <net/dst_metadata.h> #include <net/inet_dscp.h> MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ip6tnl"); MODULE_ALIAS_NETDEV("ip6tnl0"); #define IP6_TUNNEL_HASH_SIZE_SHIFT 5 #define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) { u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); return hash_32(hash, IP6_TUNNEL_HASH_SIZE_SHIFT); } static int ip6_tnl_dev_init(struct net_device *dev); static void ip6_tnl_dev_setup(struct net_device *dev); static struct rtnl_link_ops ip6_link_ops __read_mostly; static unsigned int ip6_tnl_net_id __read_mostly; struct ip6_tnl_net { /* the IPv6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; struct ip6_tnl __rcu *collect_md_tun; }; static inline int ip6_tnl_mpls_supported(void) { return IS_ENABLED(CONFIG_MPLS); } #define for_each_ip6_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses * @net: network namespace * @link: ifindex of underlying interface * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point * * Return: * tunnel matching given end-points if found, * else fallback tunnel if its device is up, * else %NULL **/ static struct ip6_tnl * ip6_tnl_lookup(struct net *net, int link, const struct in6_addr *remote, const struct in6_addr *local) { unsigned int hash = HASH(remote, local); struct ip6_tnl *t, *cand = NULL; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct in6_addr any; for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_equal(remote, &t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else cand = t; } memset(&any, 0, sizeof(any)); hash = HASH(&any, local); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_any(&t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } hash = HASH(remote, &any); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(remote, &t->parms.raddr) || !ipv6_addr_any(&t->parms.laddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } if (cand) return cand; t = rcu_dereference(ip6n->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; return NULL; } /** * ip6_tnl_bucket - get head of list matching given tunnel parameters * @ip6n: the private data for ip6_vti in the netns * @p: parameters containing tunnel end-points * * Description: * ip6_tnl_bucket() returns the head of the list matching the * &struct in6_addr entries laddr and raddr in @p. * * Return: head of IPv6 tunnel list **/ static struct ip6_tnl __rcu ** ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; unsigned int h = 0; int prio = 0; if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { prio = 1; h = HASH(remote, local); } return &ip6n->tnls[prio][h]; } /** * ip6_tnl_link - add tunnel to hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be added **/ static void ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, t); rcu_assign_pointer(t->next , rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } /** * ip6_tnl_unlink - remove tunnel from hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be removed **/ static void ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, NULL); for (tp = ip6_tnl_bucket(ip6n, &t->parms); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static void ip6_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); } static int ip6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct ip6_tnl_net *ip6n = net_generic(t->net, ip6_tnl_net_id); int err; dev->rtnl_link_ops = &ip6_link_ops; err = register_netdevice(dev); if (err < 0) goto out; strcpy(t->parms.name, dev->name); ip6_tnl_link(ip6n, t); return 0; out: return err; } /** * ip6_tnl_create - create a new tunnel * @net: network namespace * @p: tunnel parameters * * Description: * Create tunnel matching given parameters. * * Return: * created tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) { struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; int err = -E2BIG; if (p->name[0]) { if (!dev_valid_name(p->name)) goto failed; strscpy(name, p->name, IFNAMSIZ); } else { sprintf(name, "ip6tnl%%d"); } err = -ENOMEM; dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!dev) goto failed; dev_net_set(dev, net); t = netdev_priv(dev); t->parms = *p; t->net = dev_net(dev); err = ip6_tnl_create2(dev); if (err < 0) goto failed_free; return t; failed_free: free_netdev(dev); failed: return ERR_PTR(err); } /** * ip6_tnl_locate - find or create tunnel matching given parameters * @net: network namespace * @p: tunnel parameters * @create: != 0 if allowed to create new tunnel if no match found * * Description: * ip6_tnl_locate() first tries to locate an existing tunnel * based on @parms. If this is unsuccessful, but @create is set a new * tunnel device is created and registered for use. * * Return: * matching tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_locate(struct net *net, struct __ip6_tnl_parm *p, int create) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; struct ip6_tnl __rcu **tp; struct ip6_tnl *t; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); for (tp = ip6_tnl_bucket(ip6n, p); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr) && p->link == t->parms.link) { if (create) return ERR_PTR(-EEXIST); return t; } } if (!create) return ERR_PTR(-ENODEV); return ip6_tnl_create(net, p); } /** * ip6_tnl_dev_uninit - tunnel device uninitializer * @dev: the device to be destroyed * * Description: * ip6_tnl_dev_uninit() removes tunnel from its list **/ static void ip6_tnl_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev == ip6n->fb_tnl_dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else ip6_tnl_unlink(ip6n, t); dst_cache_reset(&t->dst_cache); netdev_put(dev, &t->dev_tracker); } /** * ip6_tnl_parse_tlv_enc_lim - handle encapsulation limit option * @skb: received socket buffer * @raw: the ICMPv6 error message data * * Return: * 0 if none was found, * else index to encapsulation limit **/ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw; unsigned int nhoff = raw - skb->data; unsigned int off = nhoff + sizeof(*ipv6h); u8 nexthdr = ipv6h->nexthdr; while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { struct ipv6_opt_hdr *hdr; u16 optlen; if (!pskb_may_pull(skb, off + sizeof(*hdr))) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { optlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { optlen = ipv6_authlen(hdr); } else { optlen = ipv6_optlen(hdr); } if (!pskb_may_pull(skb, off + optlen)) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { struct frag_hdr *frag_hdr = (struct frag_hdr *)hdr; if (frag_hdr->frag_off) break; } if (nexthdr == NEXTHDR_DEST) { u16 i = 2; while (1) { struct ipv6_tlv_tnl_enc_lim *tel; /* No more room for encapsulation limit */ if (i + sizeof(*tel) > optlen) break; tel = (struct ipv6_tlv_tnl_enc_lim *)(skb->data + off + i); /* return index of option if found and valid */ if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && tel->length == 1) return i + off - nhoff; /* else jump to next option */ if (tel->type) i += tel->length + 2; else i++; } } nexthdr = hdr->nexthdr; off += optlen; } return 0; } EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim); /* ip6_tnl_err() should handle errors in the tunnel according to the * specifications in RFC 2473. */ static int ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, u8 *type, u8 *code, int *msg, __u32 *info, int offset) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data; struct net *net = dev_net(skb->dev); u8 rel_type = ICMPV6_DEST_UNREACH; u8 rel_code = ICMPV6_ADDR_UNREACH; __u32 rel_info = 0; struct ip6_tnl *t; int err = -ENOENT; int rel_msg = 0; u8 tproto; __u16 len; /* If the packet doesn't contain the original IPv6 header we are in trouble since we might need the source address for further processing of the error. */ rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->daddr, &ipv6h->saddr); if (!t) goto out; tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto out; err = 0; switch (*type) { case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } break; case ICMPV6_PARAMPROB: { struct ipv6_tlv_tnl_enc_lim *tel; __u32 teli; teli = 0; if ((*code) == ICMPV6_HDR_FIELD) teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } } else { net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", t->parms.name); } break; } case ICMPV6_PKT_TOOBIG: { __u32 mtu; ip6_update_pmtu(skb, net, htonl(*info), 0, 0, sock_net_uid(net, NULL)); mtu = *info - offset; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; len = sizeof(*ipv6h) + ntohs(ipv6h->payload_len); if (len > mtu) { rel_type = ICMPV6_PKT_TOOBIG; rel_code = 0; rel_info = mtu; rel_msg = 1; } break; } case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); break; } *type = rel_type; *code = rel_code; *info = rel_info; *msg = rel_msg; out: rcu_read_unlock(); return err; } static int ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); const struct iphdr *eiph; struct sk_buff *skb2; int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; struct rtable *rt; struct flowi4 fl4; err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg == 0) return 0; switch (rel_type) { case ICMPV6_DEST_UNREACH: if (rel_code != ICMPV6_ADDR_UNREACH) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_HOST_UNREACH; break; case ICMPV6_PKT_TOOBIG: if (rel_code != 0) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_FRAG_NEEDED; break; default: return 0; } if (!pskb_may_pull(skb, offset + sizeof(struct iphdr))) return 0; skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); eiph = ip_hdr(skb2); /* Try to guess incoming interface */ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, 0, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt)) goto out; skb2->dev = rt->dst.dev; ip_rt_put(rt); /* route "incoming" packet */ if (rt->rt_flags & RTCF_LOCAL) { rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->daddr, eiph->saddr, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL6) { if (!IS_ERR(rt)) ip_rt_put(rt); goto out; } skb_dst_set(skb2, &rt->dst); } else { if (ip_route_input(skb2, eiph->daddr, eiph->saddr, ip4h_dscp(eiph), skb2->dev) || skb_dst(skb2)->dev->type != ARPHRD_TUNNEL6) goto out; } /* change mtu on this route */ if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) { if (rel_info > dst_mtu(skb_dst(skb2))) goto out; skb_dst_update_pmtu_no_confirm(skb2, rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); out: kfree_skb(skb2); return 0; } static int ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg && pskb_may_pull(skb, offset + sizeof(struct ipv6hdr))) { struct rt6_info *rt; struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); /* Try to guess incoming interface */ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, skb2, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; icmpv6_send(skb2, rel_type, rel_code, rel_info); ip6_rt_put(rt); kfree_skb(skb2); } return 0; } static int mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); return err; } static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK; if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield); return IP6_ECN_decapsulate(ipv6h, skb); } static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb)); return IP6_ECN_decapsulate(ipv6h, skb); } static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { /* ECN is not supported in AF_MPLS */ return 0; } __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ltype = ipv6_addr_type(laddr); int rtype = ipv6_addr_type(raddr); __u32 flags = 0; if (ltype == IPV6_ADDR_ANY || rtype == IPV6_ADDR_ANY) { flags = IP6_TNL_F_CAP_PER_PACKET; } else if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && !((ltype|rtype) & IPV6_ADDR_LOOPBACK) && (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) { if (ltype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_XMIT; if (rtype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_RCV; } return flags; } EXPORT_SYMBOL(ip6_tnl_get_cap); /* called with rcu_read_lock() */ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if ((p->flags & IP6_TNL_F_CAP_RCV) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_RCV))) { struct net_device *ldev = NULL; if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if ((ipv6_addr_is_multicast(laddr) || likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) && ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) || likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE)))) ret = 1; } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl); static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb), bool log_ecn_err) { const struct ipv6hdr *ipv6h; int nh, err; if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } tunnel->i_seqno = ntohl(tpi->seq) + 1; } skb->protocol = tpi->proto; /* Warning: All skb pointers will be invalidated! */ if (tunnel->dev->type == ARPHRD_ETHER) { if (!pskb_may_pull(skb, ETH_HLEN)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { skb->dev = tunnel->dev; skb_reset_mac_header(skb); } /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } /* Get the outer header. */ ipv6h = (struct ipv6hdr *)(skb->head + nh); memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); err = dscp_ecn_decapsulate(tunnel, ipv6h, skb); if (unlikely(err)) { if (log_ecn_err) net_info_ratelimited("non-ECT from %pI6 with DS=%#x\n", &ipv6h->saddr, ipv6_get_dsfield(ipv6h)); if (err > 1) { DEV_STATS_INC(tunnel->dev, rx_frame_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } } dev_sw_netstats_rx_add(tunnel->dev, skb->len); skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); if (tun_dst) skb_dst_set(skb, (struct dst_entry *)tun_dst); gro_cells_receive(&tunnel->gro_cells, skb); return 0; drop: if (tun_dst) dst_release((struct dst_entry *)tun_dst); kfree_skb(skb); return 0; } int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_err) { int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb); dscp_ecn_decapsulate = ip6ip6_dscp_ecn_decapsulate; if (tpi->proto == htons(ETH_P_IP)) dscp_ecn_decapsulate = ip4ip6_dscp_ecn_decapsulate; return __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_err); } EXPORT_SYMBOL(ip6_tnl_rcv); static const struct tnl_ptk_info tpi_v6 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IPV6), }; static const struct tnl_ptk_info tpi_v4 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IP), }; static const struct tnl_ptk_info tpi_mpls = { /* no tunnel info required for mplsip6. */ .proto = htons(ETH_P_MPLS_UC), }; static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, const struct tnl_ptk_info *tpi, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb)) { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct metadata_dst *tun_dst = NULL; int ret = -1; rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->saddr, &ipv6h->daddr); if (t) { u8 tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; ipv6h = ipv6_hdr(skb); if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) goto drop; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (t->parms.collect_md) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; tun_dst = ipv6_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) goto drop; } ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error); } rcu_read_unlock(); return ret; drop: rcu_read_unlock(); kfree_skb(skb); return 0; } static int ip4ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPIP, &tpi_v4, ip4ip6_dscp_ecn_decapsulate); } static int ip6ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPV6, &tpi_v6, ip6ip6_dscp_ecn_decapsulate); } static int mplsip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls, mplsip6_dscp_ecn_decapsulate); } struct ipv6_tel_txoption { struct ipv6_txoptions ops; __u8 dst_opt[8]; }; static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) { memset(opt, 0, sizeof(struct ipv6_tel_txoption)); opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT; opt->dst_opt[3] = 1; opt->dst_opt[4] = encap_limit; opt->dst_opt[5] = IPV6_TLV_PADN; opt->dst_opt[6] = 1; opt->ops.dst1opt = (struct ipv6_opt_hdr *) opt->dst_opt; opt->ops.opt_nflen = 8; } /** * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own * @t: the outgoing tunnel device * @hdr: IPv6 header from the incoming packet * * Description: * Avoid trivial tunneling loop by checking that tunnel exit-point * doesn't match source of incoming packet. * * Return: * 1 if conflict, * 0 else **/ static inline bool ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) { return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); } int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if (t->parms.collect_md) return 1; if ((p->flags & IP6_TNL_F_CAP_XMIT) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) { struct net_device *ldev = NULL; rcu_read_lock(); if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Local address not yet configured!\n", p->name); else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && !ipv6_addr_is_multicast(raddr) && unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Routing loop! Remote address found on this node!\n", p->name); else ret = 1; rcu_read_unlock(); } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl); /** * ip6_tnl_xmit - encapsulate packet and send * @skb: the outgoing socket buffer * @dev: the outgoing tunnel device * @dsfield: dscp code for outer header * @fl6: flow of tunneled packet * @encap_limit: encapsulation limit * @pmtu: Path MTU is stored if packet is too big * @proto: next header value * * Description: * Build new header and do some sanity checks on the packet before sending * it. * * Return: * 0 on success * -1 fail * %-EMSGSIZE message too big. return mtu in this case. **/ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, __u32 *pmtu, __u8 proto) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ipv6hdr *ipv6h; struct ipv6_tel_txoption opt; struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; __be16 payload_protocol; bool use_cache = false; u8 hop_limit; int err = -1; payload_protocol = skb_protocol(skb, true); if (t->parms.collect_md) { hop_limit = skb_tunnel_info(skb)->key.ttl; goto route_lookup; } else { hop_limit = t->parms.hop_limit; } /* NBMA tunnel */ if (ipv6_addr_any(&t->parms.raddr)) { if (payload_protocol == htons(ETH_P_IPV6)) { struct in6_addr *addr6; struct neighbour *neigh; int addr_type; if (!skb_dst(skb)) goto tx_err_link_failure; neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); if (!neigh) goto tx_err_link_failure; addr6 = (struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) addr6 = &ipv6_hdr(skb)->daddr; memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (payload_protocol == htons(ETH_P_IP)) { const struct rtable *rt = skb_rtable(skb); if (!rt) goto tx_err_link_failure; if (rt->rt_gw_family == AF_INET6) memcpy(&fl6->daddr, &rt->rt_gw6, sizeof(fl6->daddr)); } } else if (t->parms.proto != 0 && !(t->parms.flags & (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) { /* enable the cache only if neither the outer protocol nor the * routing decision depends on the current inner header value */ use_cache = true; } if (use_cache) dst = dst_cache_get(&t->dst_cache); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; if (!dst) { route_lookup: /* add dsfield to flowlabel for route lookup */ fl6->flowlabel = ip6_make_flowinfo(dsfield, fl6->flowlabel); dst = ip6_route_output(net, NULL, fl6); if (dst->error) goto tx_err_link_failure; dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } if (t->parms.collect_md && ipv6_addr_any(&fl6->saddr) && ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, &fl6->daddr, 0, &fl6->saddr)) goto tx_err_link_failure; ndst = dst; } tdev = dst->dev; if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", t->parms.name); goto tx_err_dst_release; } mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; } mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ? IPV6_MIN_MTU : IPV4_MIN_MTU); skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; } if (t->err_count > 0) { if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) { t->err_count--; dst_link_failure(skb); } else { t->err_count = 0; } } skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom += LL_RESERVED_SPACE(tdev); if (skb_headroom(skb) < max_headroom || skb_shared(skb) || (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb; new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) goto tx_err_dst_release; if (skb->sk) skb_set_owner_w(new_skb, skb->sk); consume_skb(skb); skb = new_skb; } if (t->parms.collect_md) { if (t->encap.type != TUNNEL_ENCAP_NONE) goto tx_err_dst_release; } else { if (use_cache && ndst) dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); } skb_dst_set(skb, dst); if (hop_limit == 0) { if (payload_protocol == htons(ETH_P_IP)) hop_limit = ip_hdr(skb)->ttl; else if (payload_protocol == htons(ETH_P_IPV6)) hop_limit = ipv6_hdr(skb)->hop_limit; else hop_limit = ip6_dst_hoplimit(dst); } /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. */ max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; if (max_headroom > READ_ONCE(dev->needed_headroom)) WRITE_ONCE(dev->needed_headroom, max_headroom); err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) return err; if (encap_limit >= 0) { init_tel_txopt(&opt, encap_limit); ipv6_push_frag_opts(skb, &opt.ops, &proto); } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, dsfield, ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); ipv6h->hop_limit = hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev); return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } EXPORT_SYMBOL(ip6_tnl_xmit); static inline int ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, u8 protocol) { struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h; const struct iphdr *iph; int encap_limit = -1; __u16 offset; struct flowi6 fl6; __u8 dsfield, orig_dsfield; __u32 mtu; u8 tproto; int err; tproto = READ_ONCE(t->parms.proto); if (tproto != protocol && tproto != 0) return -1; if (t->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET6)) return -1; key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = protocol; fl6.saddr = key->u.ipv6.src; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; dsfield = key->tos; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); break; default: orig_dsfield = dsfield; break; } } else { if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; if (protocol == IPPROTO_IPV6) { offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); /* ip6_tnl_parse_tlv_enc_lim() might have * reallocated skb->head */ if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; tel = (void *)&skb_network_header(skb)[offset]; if (tel->encap_limit == 0) { icmpv6_ndo_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offset + 2); return -1; } encap_limit = tel->encap_limit - 1; } } memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = protocol; if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else fl6.flowi6_mark = t->parms.fwmark; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6.flowlabel |= ip6_flowlabel(ipv6h); break; default: orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo); break; } } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield); if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; skb_set_inner_ipproto(skb, protocol); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, protocol); if (err != 0) { /* XXX: send ICMP error even if DF is not set. */ if (err == -EMSGSIZE) switch (protocol) { case IPPROTO_IPIP: icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); break; case IPPROTO_IPV6: icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); break; default: break; } return -1; } return 0; } static netdev_tx_t ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); u8 ipproto; int ret; if (!pskb_inet_may_pull(skb)) goto tx_err; switch (skb->protocol) { case htons(ETH_P_IP): ipproto = IPPROTO_IPIP; break; case htons(ETH_P_IPV6): if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; ipproto = IPPROTO_IPV6; break; case htons(ETH_P_MPLS_UC): ipproto = IPPROTO_MPLS; break; default: goto tx_err; } ret = ipxip6_tnl_xmit(skb, dev, ipproto); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static void ip6_tnl_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; struct net_device *tdev = NULL; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; int t_hlen; int mtu; __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); /* Set up flowi template */ fl6->saddr = p->laddr; fl6->daddr = p->raddr; fl6->flowi6_oif = p->link; fl6->flowlabel = 0; if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); if (rt) { tdev = rt->dst.dev; ip6_rt_put(rt); } if (!tdev && p->link) tdev = __dev_get_by_index(t->net, p->link); if (tdev) { dev->needed_headroom = tdev->hard_header_len + tdev->needed_headroom + t_hlen; mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); mtu = mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) mtu -= 8; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; WRITE_ONCE(dev->mtu, mtu); } } } /** * ip6_tnl_change - update the tunnel parameters * @t: tunnel to be changed * @p: tunnel configuration parameters * * Description: * ip6_tnl_change() updates the tunnel parameters **/ static void ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) { t->parms.laddr = p->laddr; t->parms.raddr = p->raddr; t->parms.flags = p->flags; t->parms.hop_limit = p->hop_limit; t->parms.encap_limit = p->encap_limit; t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); ip6_tnl_link_config(t); } static void ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); ip6_tnl_unlink(ip6n, t); synchronize_net(); ip6_tnl_change(t, p); ip6_tnl_link(ip6n, t); netdev_state_change(t->dev); } static void ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { /* for default tnl0 device allow to change only the proto */ t->parms.proto = p->proto; netdev_state_change(t->dev); } static void ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u) { p->laddr = u->laddr; p->raddr = u->raddr; p->flags = u->flags; p->hop_limit = u->hop_limit; p->encap_limit = u->encap_limit; p->flowinfo = u->flowinfo; p->link = u->link; p->proto = u->proto; memcpy(p->name, u->name, sizeof(u->name)); } static void ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) { u->laddr = p->laddr; u->raddr = p->raddr; u->flags = p->flags; u->hop_limit = p->hop_limit; u->encap_limit = p->encap_limit; u->flowinfo = p->flowinfo; u->link = p->link; u->proto = p->proto; memcpy(u->name, p->name, sizeof(u->name)); } /** * ip6_tnl_siocdevprivate - configure ipv6 tunnels from userspace * @dev: virtual device associated with tunnel * @ifr: unused * @data: parameters passed from userspace * @cmd: command to be performed * * Description: * ip6_tnl_ioctl() is used for managing IPv6 tunnels * from userspace. * * The possible commands are the following: * %SIOCGETTUNNEL: get tunnel parameters for device * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters * %SIOCCHGTUNNEL: change tunnel parameters to those given * %SIOCDELTUNNEL: delete tunnel * * The fallback device "ip6tnl0", created during module * initialization, can be used for creating other tunnel devices. * * Return: * 0 on success, * %-EFAULT if unable to copy data to or from userspace, * %-EPERM if current process hasn't %CAP_NET_ADMIN set * %-EINVAL if passed tunnel parameters are invalid, * %-EEXIST if changing a tunnel's parameters would cause a conflict * %-ENODEV if attempting to change or delete a nonexisting device **/ static int ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm p; struct __ip6_tnl_parm p1; struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); memset(&p1, 0, sizeof(p1)); switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) t = netdev_priv(dev); } else { memset(&p, 0, sizeof(p)); } ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && p.proto != 0) break; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL); if (cmd == SIOCCHGTUNNEL) { if (!IS_ERR(t)) { if (t->dev != dev) { err = -EEXIST; break; } } else t = netdev_priv(dev); if (dev == ip6n->fb_tnl_dev) ip6_tnl0_update(t, &p1); else ip6_tnl_update(t, &p1); } if (!IS_ERR(t)) { err = 0; ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else { err = PTR_ERR(t); } break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) break; dev = t->dev; } err = 0; unregister_netdevice(dev); break; default: err = -EINVAL; } return err; } /** * ip6_tnl_change_mtu - change mtu manually for tunnel device * @dev: virtual device associated with tunnel * @new_mtu: the new mtu * * Return: * 0 on success, * %-EINVAL if mtu too small **/ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); int t_hlen; t_hlen = tnl->hlen + sizeof(struct ipv6hdr); if (tnl->parms.proto == IPPROTO_IPV6) { if (new_mtu < IPV6_MIN_MTU) return -EINVAL; } else { if (new_mtu < ETH_MIN_MTU) return -EINVAL; } if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) { if (new_mtu > IP6_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } else { if (new_mtu > IP_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(ip6_tnl_change_mtu); int ip6_tnl_get_iflink(const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); return READ_ONCE(t->parms.link); } EXPORT_SYMBOL(ip6_tnl_get_iflink); int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; return !cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], NULL, ops) ? 0 : -1; } EXPORT_SYMBOL(ip6_tnl_encap_add_ops); int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { int ret; if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; ret = (cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL(ip6_tnl_encap_del_ops); int ip6_tnl_encap_setup(struct ip6_tnl *t, struct ip_tunnel_encap *ipencap) { int hlen; memset(&t->encap, 0, sizeof(t->encap)); hlen = ip6_encap_hlen(ipencap); if (hlen < 0) return hlen; t->encap.type = ipencap->type; t->encap.sport = ipencap->sport; t->encap.dport = ipencap->dport; t->encap.flags = ipencap->flags; t->encap_hlen = hlen; t->hlen = t->encap_hlen + t->tun_hlen; return 0; } EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup); static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, .ndo_start_xmit = ip6_tnl_start_xmit, .ndo_siocdevprivate = ip6_tnl_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; #define IPXIPX_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) /** * ip6_tnl_dev_setup - setup virtual tunnel device * @dev: virtual device associated with tunnel * * Description: * Initialize function pointers and device parameters **/ static void ip6_tnl_dev_setup(struct net_device *dev) { dev->netdev_ops = &ip6_tnl_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); dev->lltx = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); dev->features |= IPXIPX_FEATURES; dev->hw_features |= IPXIPX_FEATURES; /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); } /** * ip6_tnl_dev_init_gen - general initializer for all tunnel devices * @dev: virtual device associated with tunnel **/ static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int ret; int t_hlen; t->dev = dev; ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) return ret; ret = gro_cells_init(&t->gro_cells, dev); if (ret) goto destroy_dst; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); dev->type = ARPHRD_TUNNEL6; dev->mtu = ETH_DATA_LEN - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len - t_hlen; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; destroy_dst: dst_cache_destroy(&t->dst_cache); return ret; } /** * ip6_tnl_dev_init - initializer for all non fallback tunnel devices * @dev: virtual device associated with tunnel **/ static int ip6_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int err = ip6_tnl_dev_init_gen(dev); if (err) return err; ip6_tnl_link_config(t); if (t->parms.collect_md) netif_keep_dst(dev); return 0; } /** * ip6_fb_tnl_dev_init - initializer for fallback tunnel device * @dev: fallback device * * Return: 0 **/ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); t->net = net; t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; } static int ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (proto != IPPROTO_IPV6 && proto != IPPROTO_IPIP && proto != 0) return -EINVAL; return 0; } static void ip6_tnl_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); if (data[IFLA_IPTUN_LOCAL]) parms->laddr = nla_get_in6_addr(data[IFLA_IPTUN_LOCAL]); if (data[IFLA_IPTUN_REMOTE]) parms->raddr = nla_get_in6_addr(data[IFLA_IPTUN_REMOTE]); if (data[IFLA_IPTUN_TTL]) parms->hop_limit = nla_get_u8(data[IFLA_IPTUN_TTL]); if (data[IFLA_IPTUN_ENCAP_LIMIT]) parms->encap_limit = nla_get_u8(data[IFLA_IPTUN_ENCAP_LIMIT]); if (data[IFLA_IPTUN_FLOWINFO]) parms->flowinfo = nla_get_be32(data[IFLA_IPTUN_FLOWINFO]); if (data[IFLA_IPTUN_FLAGS]) parms->flags = nla_get_u32(data[IFLA_IPTUN_FLAGS]); if (data[IFLA_IPTUN_PROTO]) parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (data[IFLA_IPTUN_COLLECT_METADATA]) parms->collect_md = true; if (data[IFLA_IPTUN_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } static int ip6_tnl_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; struct ip6_tnl_net *ip6n; struct ip6_tnl *nt, *t; struct net *net; int err; net = params->link_net ? : dev_net(dev); ip6n = net_generic(net, ip6_tnl_net_id); nt = netdev_priv(dev); nt->net = net; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip6_tnl_encap_setup(nt, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &nt->parms); if (nt->parms.collect_md) { if (rtnl_dereference(ip6n->collect_md_tun)) return -EEXIST; } else { t = ip6_tnl_locate(net, &nt->parms, 0); if (!IS_ERR(t)) return -EEXIST; } err = ip6_tnl_create2(dev); if (!err && tb[IFLA_MTU]) ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); return err; } static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm p; struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip_tunnel_encap ipencap; if (dev == ip6n->fb_tnl_dev) return -EINVAL; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(t, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &p); if (p.collect_md) return -EINVAL; t = ip6_tnl_locate(net, &p, 0); if (!IS_ERR(t)) { if (t->dev != dev) return -EEXIST; } else t = netdev_priv(dev); ip6_tnl_update(t, &p); return 0; } static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev != ip6n->fb_tnl_dev) unregister_netdevice_queue(dev, head); } static size_t ip6_tnl_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_REMOTE */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_LIMIT */ nla_total_size(1) + /* IFLA_IPTUN_FLOWINFO */ nla_total_size(4) + /* IFLA_IPTUN_FLAGS */ nla_total_size(4) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); struct __ip6_tnl_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in6_addr(skb, IFLA_IPTUN_LOCAL, &parm->laddr) || nla_put_in6_addr(skb, IFLA_IPTUN_REMOTE, &parm->raddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->hop_limit) || nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) || nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) || nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, parm->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; if (parm->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } struct net *ip6_tnl_get_link_net(const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip6_tnl_get_link_net); static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_REMOTE] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_LIMIT] = { .type = NLA_U8 }, [IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 }, [IFLA_IPTUN_FLAGS] = { .type = NLA_U32 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ip6_link_ops __read_mostly = { .kind = "ip6tnl", .maxtype = IFLA_IPTUN_MAX, .policy = ip6_tnl_policy, .priv_size = sizeof(struct ip6_tnl), .setup = ip6_tnl_dev_setup, .validate = ip6_tnl_validate, .newlink = ip6_tnl_newlink, .changelink = ip6_tnl_changelink, .dellink = ip6_tnl_dellink, .get_size = ip6_tnl_get_size, .fill_info = ip6_tnl_fill_info, .get_link_net = ip6_tnl_get_link_net, }; static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { .handler = ip4ip6_rcv, .err_handler = ip4ip6_err, .priority = 1, }; static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .handler = ip6ip6_rcv, .err_handler = ip6ip6_err, .priority = 1, }; static struct xfrm6_tunnel mplsip6_handler __read_mostly = { .handler = mplsip6_rcv, .err_handler = mplsip6_err, .priority = 1, }; static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; int h; struct ip6_tnl *t; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6_link_ops) unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } t = rtnl_dereference(ip6n->tnls_wc[0]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } static int __net_init ip6_tnl_init_net(struct net *net) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip6_tnl *t = NULL; int err; ip6n->tnls[0] = ip6n->tnls_wc; ip6n->tnls[1] = ip6n->tnls_r_l; if (!net_has_fallback_tunnels(net)) return 0; err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); ip6n->fb_tnl_dev->rtnl_link_ops = &ip6_link_ops; /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ ip6n->fb_tnl_dev->netns_immutable = true; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) goto err_register; err = register_netdev(ip6n->fb_tnl_dev); if (err < 0) goto err_register; t = netdev_priv(ip6n->fb_tnl_dev); strcpy(t->parms.name, ip6n->fb_tnl_dev->name); return 0; err_register: free_netdev(ip6n->fb_tnl_dev); err_alloc_dev: return err; } static void __net_exit ip6_tnl_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_to_kill) { struct net *net; ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) ip6_tnl_destroy_tunnels(net, dev_to_kill); } static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, .exit_batch_rtnl = ip6_tnl_exit_batch_rtnl, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; /** * ip6_tunnel_init - register protocol and reserve needed resources * * Return: 0 on success **/ static int __init ip6_tunnel_init(void) { int err; if (!ipv6_mod_enabled()) return -EOPNOTSUPP; err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET); if (err < 0) { pr_err("%s: can't register ip4ip6\n", __func__); goto out_ip4ip6; } err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6); if (err < 0) { pr_err("%s: can't register ip6ip6\n", __func__); goto out_ip6ip6; } if (ip6_tnl_mpls_supported()) { err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS); if (err < 0) { pr_err("%s: can't register mplsip6\n", __func__); goto out_mplsip6; } } err = rtnl_link_register(&ip6_link_ops); if (err < 0) goto rtnl_link_failed; return 0; rtnl_link_failed: if (ip6_tnl_mpls_supported()) xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS); out_mplsip6: xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); out_ip6ip6: xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); out_ip4ip6: unregister_pernet_device(&ip6_tnl_net_ops); out_pernet: return err; } /** * ip6_tunnel_cleanup - free resources and unregister protocol **/ static void __exit ip6_tunnel_cleanup(void) { rtnl_link_unregister(&ip6_link_ops); if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET)) pr_info("%s: can't deregister ip4ip6\n", __func__); if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) pr_info("%s: can't deregister ip6ip6\n", __func__); if (ip6_tnl_mpls_supported() && xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS)) pr_info("%s: can't deregister mplsip6\n", __func__); unregister_pernet_device(&ip6_tnl_net_ops); } module_init(ip6_tunnel_init); module_exit(ip6_tunnel_cleanup); |
| 17 17 2 2 2 2 2 2 2 2 3 3 6 6 9 3 6 6 6 6 6 3 1 1 2 7 9 3 7 10 9 10 1 1 10 10 14 14 6 14 14 12 13 5 9 7 14 2 11 7 3 3 6 12 1 4 7 3 3 1 3 1 17 1 15 13 3 3 2 14 18 17 18 14 14 6 8 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 | // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * Frame router for HSR and PRP. */ #include "hsr_forward.h" #include <linux/types.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> #include <linux/if_vlan.h> #include "hsr_main.h" #include "hsr_framereg.h" struct hsr_node; /* The uses I can see for these HSR supervision frames are: * 1) Use the frames that are sent after node initialization ("HSR_TLV.Type = * 22") to reset any sequence_nr counters belonging to that node. Useful if * the other node's counter has been reset for some reason. * -- * Or not - resetting the counter and bridging the frame would create a * loop, unfortunately. * * 2) Use the LifeCheck frames to detect ring breaks. I.e. if no LifeCheck * frame is received from a particular node, we know something is wrong. * We just register these (as with normal frames) and throw them away. * * 3) Allow different MAC addresses for the two slave interfaces, using the * MacAddressA field. */ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) { struct ethhdr *eth_hdr; struct hsr_sup_tag *hsr_sup_tag; struct hsrv1_ethhdr_sp *hsr_V1_hdr; struct hsr_sup_tlv *hsr_sup_tlv; u16 total_length = 0; WARN_ON_ONCE(!skb_mac_header_was_set(skb)); eth_hdr = (struct ethhdr *)skb_mac_header(skb); /* Correct addr? */ if (!ether_addr_equal(eth_hdr->h_dest, hsr->sup_multicast_addr)) return false; /* Correct ether type?. */ if (!(eth_hdr->h_proto == htons(ETH_P_PRP) || eth_hdr->h_proto == htons(ETH_P_HSR))) return false; /* Get the supervision header from correct location. */ if (eth_hdr->h_proto == htons(ETH_P_HSR)) { /* Okay HSRv1. */ total_length = sizeof(struct hsrv1_ethhdr_sp); if (!pskb_may_pull(skb, total_length)) return false; hsr_V1_hdr = (struct hsrv1_ethhdr_sp *)skb_mac_header(skb); if (hsr_V1_hdr->hsr.encap_proto != htons(ETH_P_PRP)) return false; hsr_sup_tag = &hsr_V1_hdr->hsr_sup; } else { total_length = sizeof(struct hsrv0_ethhdr_sp); if (!pskb_may_pull(skb, total_length)) return false; hsr_sup_tag = &((struct hsrv0_ethhdr_sp *)skb_mac_header(skb))->hsr_sup; } if (hsr_sup_tag->tlv.HSR_TLV_type != HSR_TLV_ANNOUNCE && hsr_sup_tag->tlv.HSR_TLV_type != HSR_TLV_LIFE_CHECK && hsr_sup_tag->tlv.HSR_TLV_type != PRP_TLV_LIFE_CHECK_DD && hsr_sup_tag->tlv.HSR_TLV_type != PRP_TLV_LIFE_CHECK_DA) return false; if (hsr_sup_tag->tlv.HSR_TLV_length != 12 && hsr_sup_tag->tlv.HSR_TLV_length != sizeof(struct hsr_sup_payload)) return false; /* Get next tlv */ total_length += hsr_sup_tag->tlv.HSR_TLV_length; if (!pskb_may_pull(skb, total_length)) return false; skb_pull(skb, total_length); hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data; skb_push(skb, total_length); /* if this is a redbox supervision frame we need to verify * that more data is available */ if (hsr_sup_tlv->HSR_TLV_type == PRP_TLV_REDBOX_MAC) { /* tlv length must be a length of a mac address */ if (hsr_sup_tlv->HSR_TLV_length != sizeof(struct hsr_sup_payload)) return false; /* make sure another tlv follows */ total_length += sizeof(struct hsr_sup_tlv) + hsr_sup_tlv->HSR_TLV_length; if (!pskb_may_pull(skb, total_length)) return false; /* get next tlv */ skb_pull(skb, total_length); hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data; skb_push(skb, total_length); } /* end of tlvs must follow at the end */ if (hsr_sup_tlv->HSR_TLV_type == HSR_TLV_EOT && hsr_sup_tlv->HSR_TLV_length != 0) return false; return true; } static bool is_proxy_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) { struct hsr_sup_payload *payload; struct ethhdr *eth_hdr; u16 total_length = 0; eth_hdr = (struct ethhdr *)skb_mac_header(skb); /* Get the HSR protocol revision. */ if (eth_hdr->h_proto == htons(ETH_P_HSR)) total_length = sizeof(struct hsrv1_ethhdr_sp); else total_length = sizeof(struct hsrv0_ethhdr_sp); if (!pskb_may_pull(skb, total_length + sizeof(struct hsr_sup_payload))) return false; skb_pull(skb, total_length); payload = (struct hsr_sup_payload *)skb->data; skb_push(skb, total_length); /* For RedBox (HSR-SAN) check if we have received the supervision * frame with MAC addresses from own ProxyNodeTable. */ return hsr_is_node_in_db(&hsr->proxy_node_db, payload->macaddress_A); } static struct sk_buff *create_stripped_skb_hsr(struct sk_buff *skb_in, struct hsr_frame_info *frame) { struct sk_buff *skb; int copylen; unsigned char *dst, *src; skb_pull(skb_in, HSR_HLEN); skb = __pskb_copy(skb_in, skb_headroom(skb_in) - HSR_HLEN, GFP_ATOMIC); skb_push(skb_in, HSR_HLEN); if (!skb) return NULL; skb_reset_mac_header(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) skb->csum_start -= HSR_HLEN; copylen = 2 * ETH_ALEN; if (frame->is_vlan) copylen += VLAN_HLEN; src = skb_mac_header(skb_in); dst = skb_mac_header(skb); memcpy(dst, src, copylen); skb->protocol = eth_hdr(skb)->h_proto; return skb; } struct sk_buff *hsr_get_untagged_frame(struct hsr_frame_info *frame, struct hsr_port *port) { if (!frame->skb_std) { if (frame->skb_hsr) frame->skb_std = create_stripped_skb_hsr(frame->skb_hsr, frame); else netdev_warn_once(port->dev, "Unexpected frame received in hsr_get_untagged_frame()\n"); if (!frame->skb_std) return NULL; } return skb_clone(frame->skb_std, GFP_ATOMIC); } struct sk_buff *prp_get_untagged_frame(struct hsr_frame_info *frame, struct hsr_port *port) { if (!frame->skb_std) { if (frame->skb_prp) { /* trim the skb by len - HSR_HLEN to exclude RCT */ skb_trim(frame->skb_prp, frame->skb_prp->len - HSR_HLEN); frame->skb_std = __pskb_copy(frame->skb_prp, skb_headroom(frame->skb_prp), GFP_ATOMIC); } else { /* Unexpected */ WARN_ONCE(1, "%s:%d: Unexpected frame received (port_src %s)\n", __FILE__, __LINE__, port->dev->name); return NULL; } } return skb_clone(frame->skb_std, GFP_ATOMIC); } static void prp_set_lan_id(struct prp_rct *trailer, struct hsr_port *port) { int lane_id; if (port->type == HSR_PT_SLAVE_A) lane_id = 0; else lane_id = 1; /* Add net_id in the upper 3 bits of lane_id */ lane_id |= port->hsr->net_id; set_prp_lan_id(trailer, lane_id); } /* Tailroom for PRP rct should have been created before calling this */ static struct sk_buff *prp_fill_rct(struct sk_buff *skb, struct hsr_frame_info *frame, struct hsr_port *port) { struct prp_rct *trailer; int min_size = ETH_ZLEN; int lsdu_size; if (!skb) return skb; if (frame->is_vlan) min_size = VLAN_ETH_ZLEN; if (skb_put_padto(skb, min_size)) return NULL; trailer = (struct prp_rct *)skb_put(skb, HSR_HLEN); lsdu_size = skb->len - 14; if (frame->is_vlan) lsdu_size -= 4; prp_set_lan_id(trailer, port); set_prp_LSDU_size(trailer, lsdu_size); trailer->sequence_nr = htons(frame->sequence_nr); trailer->PRP_suffix = htons(ETH_P_PRP); skb->protocol = eth_hdr(skb)->h_proto; return skb; } static void hsr_set_path_id(struct hsr_ethhdr *hsr_ethhdr, struct hsr_port *port) { int path_id; if (port->type == HSR_PT_SLAVE_A) path_id = 0; else path_id = 1; set_hsr_tag_path(&hsr_ethhdr->hsr_tag, path_id); } static struct sk_buff *hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame, struct hsr_port *port, u8 proto_version) { struct hsr_ethhdr *hsr_ethhdr; unsigned char *pc; int lsdu_size; /* pad to minimum packet size which is 60 + 6 (HSR tag) */ if (skb_put_padto(skb, ETH_ZLEN + HSR_HLEN)) return NULL; lsdu_size = skb->len - 14; if (frame->is_vlan) lsdu_size -= 4; pc = skb_mac_header(skb); if (frame->is_vlan) /* This 4-byte shift (size of a vlan tag) does not * mean that the ethhdr starts there. But rather it * provides the proper environment for accessing * the fields, such as hsr_tag etc., just like * when the vlan tag is not there. This is because * the hsr tag is after the vlan tag. */ hsr_ethhdr = (struct hsr_ethhdr *)(pc + VLAN_HLEN); else hsr_ethhdr = (struct hsr_ethhdr *)pc; hsr_set_path_id(hsr_ethhdr, port); set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size); hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr); hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto; hsr_ethhdr->ethhdr.h_proto = htons(proto_version ? ETH_P_HSR : ETH_P_PRP); skb->protocol = hsr_ethhdr->ethhdr.h_proto; return skb; } /* If the original frame was an HSR tagged frame, just clone it to be sent * unchanged. Otherwise, create a private frame especially tagged for 'port'. */ struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame, struct hsr_port *port) { unsigned char *dst, *src; struct sk_buff *skb; int movelen; if (frame->skb_hsr) { struct hsr_ethhdr *hsr_ethhdr = (struct hsr_ethhdr *)skb_mac_header(frame->skb_hsr); /* set the lane id properly */ hsr_set_path_id(hsr_ethhdr, port); return skb_clone(frame->skb_hsr, GFP_ATOMIC); } else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) { return skb_clone(frame->skb_std, GFP_ATOMIC); } /* Create the new skb with enough headroom to fit the HSR tag */ skb = __pskb_copy(frame->skb_std, skb_headroom(frame->skb_std) + HSR_HLEN, GFP_ATOMIC); if (!skb) return NULL; skb_reset_mac_header(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) skb->csum_start += HSR_HLEN; movelen = ETH_HLEN; if (frame->is_vlan) movelen += VLAN_HLEN; src = skb_mac_header(skb); dst = skb_push(skb, HSR_HLEN); memmove(dst, src, movelen); skb_reset_mac_header(skb); /* skb_put_padto free skb on error and hsr_fill_tag returns NULL in * that case */ return hsr_fill_tag(skb, frame, port, port->hsr->prot_version); } struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame, struct hsr_port *port) { struct sk_buff *skb; if (frame->skb_prp) { struct prp_rct *trailer = skb_get_PRP_rct(frame->skb_prp); if (trailer) { prp_set_lan_id(trailer, port); } else { WARN_ONCE(!trailer, "errored PRP skb"); return NULL; } return skb_clone(frame->skb_prp, GFP_ATOMIC); } else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) { return skb_clone(frame->skb_std, GFP_ATOMIC); } skb = skb_copy_expand(frame->skb_std, skb_headroom(frame->skb_std), skb_tailroom(frame->skb_std) + HSR_HLEN, GFP_ATOMIC); return prp_fill_rct(skb, frame, port); } static void hsr_deliver_master(struct sk_buff *skb, struct net_device *dev, struct hsr_node *node_src) { bool was_multicast_frame; int res, recv_len; was_multicast_frame = (skb->pkt_type == PACKET_MULTICAST); hsr_addr_subst_source(node_src, skb); skb_pull(skb, ETH_HLEN); recv_len = skb->len; res = netif_rx(skb); if (res == NET_RX_DROP) { dev->stats.rx_dropped++; } else { dev->stats.rx_packets++; dev->stats.rx_bytes += recv_len; if (was_multicast_frame) dev->stats.multicast++; } } static int hsr_xmit(struct sk_buff *skb, struct hsr_port *port, struct hsr_frame_info *frame) { if (frame->port_rcv->type == HSR_PT_MASTER) { hsr_addr_subst_dest(frame->node_src, skb, port); /* Address substitution (IEC62439-3 pp 26, 50): replace mac * address of outgoing frame with that of the outgoing slave's. */ ether_addr_copy(eth_hdr(skb)->h_source, port->dev->dev_addr); } /* When HSR node is used as RedBox - the frame received from HSR ring * requires source MAC address (SA) replacement to one which can be * recognized by SAN devices (otherwise, frames are dropped by switch) */ if (port->type == HSR_PT_INTERLINK) ether_addr_copy(eth_hdr(skb)->h_source, port->hsr->macaddress_redbox); return dev_queue_xmit(skb); } bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port) { return ((frame->port_rcv->type == HSR_PT_SLAVE_A && port->type == HSR_PT_SLAVE_B) || (frame->port_rcv->type == HSR_PT_SLAVE_B && port->type == HSR_PT_SLAVE_A)); } bool hsr_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port) { struct sk_buff *skb; if (port->dev->features & NETIF_F_HW_HSR_FWD) return prp_drop_frame(frame, port); /* RedBox specific frames dropping policies * * Do not send HSR supervisory frames to SAN devices */ if (frame->is_supervision && port->type == HSR_PT_INTERLINK) return true; /* Do not forward to other HSR port (A or B) unicast frames which * are addressed to interlink port (and are in the ProxyNodeTable). */ skb = frame->skb_hsr; if (skb && prp_drop_frame(frame, port) && is_unicast_ether_addr(eth_hdr(skb)->h_dest) && hsr_is_node_in_db(&port->hsr->proxy_node_db, eth_hdr(skb)->h_dest)) { return true; } /* Do not forward to port C (Interlink) frames from nodes A and B * if DA is in NodeTable. */ if ((frame->port_rcv->type == HSR_PT_SLAVE_A || frame->port_rcv->type == HSR_PT_SLAVE_B) && port->type == HSR_PT_INTERLINK) { skb = frame->skb_hsr; if (skb && is_unicast_ether_addr(eth_hdr(skb)->h_dest) && hsr_is_node_in_db(&port->hsr->node_db, eth_hdr(skb)->h_dest)) { return true; } } /* Do not forward to port A and B unicast frames received on the * interlink port if it is addressed to one of nodes registered in * the ProxyNodeTable. */ if ((port->type == HSR_PT_SLAVE_A || port->type == HSR_PT_SLAVE_B) && frame->port_rcv->type == HSR_PT_INTERLINK) { skb = frame->skb_std; if (skb && is_unicast_ether_addr(eth_hdr(skb)->h_dest) && hsr_is_node_in_db(&port->hsr->proxy_node_db, eth_hdr(skb)->h_dest)) { return true; } } return false; } /* Forward the frame through all devices except: * - Back through the receiving device * - If it's a HSR frame: through a device where it has passed before * - if it's a PRP frame: through another PRP slave device (no bridge) * - To the local HSR master only if the frame is directly addressed to it, or * a non-supervision multicast or broadcast frame. * * HSR slave devices should insert a HSR tag into the frame, or forward the * frame unchanged if it's already tagged. Interlink devices should strip HSR * tags if they're of the non-HSR type (but only after duplicate discard). The * master device always strips HSR tags. */ static void hsr_forward_do(struct hsr_frame_info *frame) { struct hsr_port *port; struct sk_buff *skb; bool sent = false; hsr_for_each_port(frame->port_rcv->hsr, port) { struct hsr_priv *hsr = port->hsr; /* Don't send frame back the way it came */ if (port == frame->port_rcv) continue; /* Don't deliver locally unless we should */ if (port->type == HSR_PT_MASTER && !frame->is_local_dest) continue; /* Deliver frames directly addressed to us to master only */ if (port->type != HSR_PT_MASTER && frame->is_local_exclusive) continue; /* If hardware duplicate generation is enabled, only send out * one port. */ if ((port->dev->features & NETIF_F_HW_HSR_DUP) && sent) continue; /* Don't send frame over port where it has been sent before. * Also for SAN, this shouldn't be done. */ if (!frame->is_from_san && hsr->proto_ops->register_frame_out && hsr->proto_ops->register_frame_out(port, frame)) continue; if (frame->is_supervision && port->type == HSR_PT_MASTER && !frame->is_proxy_supervision) { hsr_handle_sup_frame(frame); continue; } /* Check if frame is to be dropped. Eg. for PRP no forward * between ports, or sending HSR supervision to RedBox. */ if (hsr->proto_ops->drop_frame && hsr->proto_ops->drop_frame(frame, port)) continue; if (port->type == HSR_PT_SLAVE_A || port->type == HSR_PT_SLAVE_B) skb = hsr->proto_ops->create_tagged_frame(frame, port); else skb = hsr->proto_ops->get_untagged_frame(frame, port); if (!skb) { frame->port_rcv->dev->stats.rx_dropped++; continue; } skb->dev = port->dev; if (port->type == HSR_PT_MASTER) { hsr_deliver_master(skb, port->dev, frame->node_src); } else { if (!hsr_xmit(skb, port, frame)) if (port->type == HSR_PT_SLAVE_A || port->type == HSR_PT_SLAVE_B) sent = true; } } } static void check_local_dest(struct hsr_priv *hsr, struct sk_buff *skb, struct hsr_frame_info *frame) { if (hsr_addr_is_self(hsr, eth_hdr(skb)->h_dest)) { frame->is_local_exclusive = true; skb->pkt_type = PACKET_HOST; } else { frame->is_local_exclusive = false; } if (skb->pkt_type == PACKET_HOST || skb->pkt_type == PACKET_MULTICAST || skb->pkt_type == PACKET_BROADCAST) { frame->is_local_dest = true; } else { frame->is_local_dest = false; } } static void handle_std_frame(struct sk_buff *skb, struct hsr_frame_info *frame) { struct hsr_port *port = frame->port_rcv; struct hsr_priv *hsr = port->hsr; frame->skb_hsr = NULL; frame->skb_prp = NULL; frame->skb_std = skb; if (port->type != HSR_PT_MASTER) frame->is_from_san = true; if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) { /* Sequence nr for the master/interlink node */ lockdep_assert_held(&hsr->seqnr_lock); frame->sequence_nr = hsr->sequence_nr; hsr->sequence_nr++; } } int hsr_fill_frame_info(__be16 proto, struct sk_buff *skb, struct hsr_frame_info *frame) { struct hsr_port *port = frame->port_rcv; struct hsr_priv *hsr = port->hsr; /* HSRv0 supervisory frames double as a tag so treat them as tagged. */ if ((!hsr->prot_version && proto == htons(ETH_P_PRP)) || proto == htons(ETH_P_HSR)) { /* Check if skb contains hsr_ethhdr */ if (skb->mac_len < sizeof(struct hsr_ethhdr)) return -EINVAL; /* HSR tagged frame :- Data or Supervision */ frame->skb_std = NULL; frame->skb_prp = NULL; frame->skb_hsr = skb; frame->sequence_nr = hsr_get_skb_sequence_nr(skb); return 0; } /* Standard frame or PRP from master port */ handle_std_frame(skb, frame); return 0; } int prp_fill_frame_info(__be16 proto, struct sk_buff *skb, struct hsr_frame_info *frame) { /* Supervision frame */ struct prp_rct *rct = skb_get_PRP_rct(skb); if (rct && prp_check_lsdu_size(skb, rct, frame->is_supervision)) { frame->skb_hsr = NULL; frame->skb_std = NULL; frame->skb_prp = skb; frame->sequence_nr = prp_get_skb_sequence_nr(rct); return 0; } handle_std_frame(skb, frame); return 0; } static int fill_frame_info(struct hsr_frame_info *frame, struct sk_buff *skb, struct hsr_port *port) { struct hsr_priv *hsr = port->hsr; struct hsr_vlan_ethhdr *vlan_hdr; struct list_head *n_db; struct ethhdr *ethhdr; __be16 proto; int ret; /* Check if skb contains ethhdr */ if (skb->mac_len < sizeof(struct ethhdr)) return -EINVAL; memset(frame, 0, sizeof(*frame)); frame->is_supervision = is_supervision_frame(port->hsr, skb); if (frame->is_supervision && hsr->redbox) frame->is_proxy_supervision = is_proxy_supervision_frame(port->hsr, skb); n_db = &hsr->node_db; if (port->type == HSR_PT_INTERLINK) n_db = &hsr->proxy_node_db; frame->node_src = hsr_get_node(port, n_db, skb, frame->is_supervision, port->type); if (!frame->node_src) return -1; /* Unknown node and !is_supervision, or no mem */ ethhdr = (struct ethhdr *)skb_mac_header(skb); frame->is_vlan = false; proto = ethhdr->h_proto; if (proto == htons(ETH_P_8021Q)) frame->is_vlan = true; if (frame->is_vlan) { /* Note: skb->mac_len might be wrong here. */ if (!pskb_may_pull(skb, skb_mac_offset(skb) + offsetofend(struct hsr_vlan_ethhdr, vlanhdr))) return -EINVAL; vlan_hdr = (struct hsr_vlan_ethhdr *)skb_mac_header(skb); proto = vlan_hdr->vlanhdr.h_vlan_encapsulated_proto; } frame->is_from_san = false; frame->port_rcv = port; ret = hsr->proto_ops->fill_frame_info(proto, skb, frame); if (ret) return ret; check_local_dest(port->hsr, skb, frame); return 0; } /* Must be called holding rcu read lock (because of the port parameter) */ void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port) { struct hsr_frame_info frame; rcu_read_lock(); if (fill_frame_info(&frame, skb, port) < 0) goto out_drop; hsr_register_frame_in(frame.node_src, port, frame.sequence_nr); hsr_forward_do(&frame); rcu_read_unlock(); /* Gets called for ingress frames as well as egress from master port. * So check and increment stats for master port only here. */ if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) { port->dev->stats.tx_packets++; port->dev->stats.tx_bytes += skb->len; } kfree_skb(frame.skb_hsr); kfree_skb(frame.skb_prp); kfree_skb(frame.skb_std); return; out_drop: rcu_read_unlock(); port->dev->stats.tx_dropped++; kfree_skb(skb); } |
| 1 1 2 1 1 1 1 1 1 67 114 86 128 83 19 57 1 58 1 63 4 58 59 4 127 164 165 165 165 162 54 108 78 124 41 6 78 78 78 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 | // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_btree_staging.h" #include "xfs_refcount_btree.h" #include "xfs_refcount.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_health.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_bit.h" #include "xfs_rmap.h" #include "xfs_ag.h" static struct kmem_cache *xfs_refcountbt_cur_cache; static struct xfs_btree_cur * xfs_refcountbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp, to_perag(cur->bc_group)); } STATIC void xfs_refcountbt_set_root( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int inc) { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; struct xfs_perag *pag = agbp->b_pag; ASSERT(ptr->s != 0); agf->agf_refcount_root = ptr->s; be32_add_cpu(&agf->agf_refcount_level, inc); pag->pagf_refcount_level += inc; xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL); } STATIC int xfs_refcountbt_alloc_block( struct xfs_btree_cur *cur, const union xfs_btree_ptr *start, union xfs_btree_ptr *new, int *stat) { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; struct xfs_alloc_arg args; /* block allocation args */ int error; /* error return value */ memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.pag = to_perag(cur->bc_group); args.oinfo = XFS_RMAP_OINFO_REFC; args.minlen = args.maxlen = args.prod = 1; args.resv = XFS_AG_RESV_METADATA; error = xfs_alloc_vextent_near_bno(&args, xfs_agbno_to_fsb(args.pag, xfs_refc_block(args.mp))); if (error) goto out_error; if (args.fsbno == NULLFSBLOCK) { *stat = 0; return 0; } ASSERT(args.agno == cur->bc_group->xg_gno); ASSERT(args.len == 1); new->s = cpu_to_be32(args.agbno); be32_add_cpu(&agf->agf_refcount_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); *stat = 1; return 0; out_error: return error; } STATIC int xfs_refcountbt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, 0); } STATIC int xfs_refcountbt_get_minrecs( struct xfs_btree_cur *cur, int level) { return cur->bc_mp->m_refc_mnr[level != 0]; } STATIC int xfs_refcountbt_get_maxrecs( struct xfs_btree_cur *cur, int level) { return cur->bc_mp->m_refc_mxr[level != 0]; } STATIC void xfs_refcountbt_init_key_from_rec( union xfs_btree_key *key, const union xfs_btree_rec *rec) { key->refc.rc_startblock = rec->refc.rc_startblock; } STATIC void xfs_refcountbt_init_high_key_from_rec( union xfs_btree_key *key, const union xfs_btree_rec *rec) { __u32 x; x = be32_to_cpu(rec->refc.rc_startblock); x += be32_to_cpu(rec->refc.rc_blockcount) - 1; key->refc.rc_startblock = cpu_to_be32(x); } STATIC void xfs_refcountbt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) { const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; uint32_t start; start = xfs_refcount_encode_startblock(irec->rc_startblock, irec->rc_domain); rec->refc.rc_startblock = cpu_to_be32(start); rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); } STATIC void xfs_refcountbt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_refcount_root; } STATIC int64_t xfs_refcountbt_key_diff( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { const struct xfs_refcount_key *kp = &key->refc; const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; uint32_t start; start = xfs_refcount_encode_startblock(irec->rc_startblock, irec->rc_domain); return (int64_t)be32_to_cpu(kp->rc_startblock) - start; } STATIC int64_t xfs_refcountbt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, const union xfs_btree_key *mask) { ASSERT(!mask || mask->refc.rc_startblock); return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - be32_to_cpu(k2->refc.rc_startblock); } STATIC xfs_failaddr_t xfs_refcountbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; xfs_failaddr_t fa; unsigned int level; if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; if (!xfs_has_reflink(mp)) return __this_address; fa = xfs_btree_agblock_v5hdr_verify(bp); if (fa) return fa; level = be16_to_cpu(block->bb_level); if (pag && xfs_perag_initialised_agf(pag)) { unsigned int maxlevel = pag->pagf_refcount_level; #ifdef CONFIG_XFS_ONLINE_REPAIR /* * Online repair could be rewriting the refcount btree, so * we'll validate against the larger of either tree while this * is going on. */ maxlevel = max_t(unsigned int, maxlevel, pag->pagf_repair_refcount_level); #endif if (level >= maxlevel) return __this_address; } else if (level >= mp->m_refc_maxlevels) return __this_address; return xfs_btree_agblock_verify(bp, mp->m_refc_mxr[level != 0]); } STATIC void xfs_refcountbt_read_verify( struct xfs_buf *bp) { xfs_failaddr_t fa; if (!xfs_btree_agblock_verify_crc(bp)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_refcountbt_verify(bp); if (fa) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } if (bp->b_error) trace_xfs_btree_corrupt(bp, _RET_IP_); } STATIC void xfs_refcountbt_write_verify( struct xfs_buf *bp) { xfs_failaddr_t fa; fa = xfs_refcountbt_verify(bp); if (fa) { trace_xfs_btree_corrupt(bp, _RET_IP_); xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } xfs_btree_agblock_calc_crc(bp); } const struct xfs_buf_ops xfs_refcountbt_buf_ops = { .name = "xfs_refcountbt", .magic = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) }, .verify_read = xfs_refcountbt_read_verify, .verify_write = xfs_refcountbt_write_verify, .verify_struct = xfs_refcountbt_verify, }; STATIC int xfs_refcountbt_keys_inorder( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2) { return be32_to_cpu(k1->refc.rc_startblock) < be32_to_cpu(k2->refc.rc_startblock); } STATIC int xfs_refcountbt_recs_inorder( struct xfs_btree_cur *cur, const union xfs_btree_rec *r1, const union xfs_btree_rec *r2) { return be32_to_cpu(r1->refc.rc_startblock) + be32_to_cpu(r1->refc.rc_blockcount) <= be32_to_cpu(r2->refc.rc_startblock); } STATIC enum xbtree_key_contig xfs_refcountbt_keys_contiguous( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask) { ASSERT(!mask || mask->refc.rc_startblock); return xbtree_key_contig(be32_to_cpu(key1->refc.rc_startblock), be32_to_cpu(key2->refc.rc_startblock)); } const struct xfs_btree_ops xfs_refcountbt_ops = { .name = "refcount", .type = XFS_BTREE_TYPE_AG, .rec_len = sizeof(struct xfs_refcount_rec), .key_len = sizeof(struct xfs_refcount_key), .ptr_len = XFS_BTREE_SHORT_PTR_LEN, .lru_refs = XFS_REFC_BTREE_REF, .statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2), .sick_mask = XFS_SICK_AG_REFCNTBT, .dup_cursor = xfs_refcountbt_dup_cursor, .set_root = xfs_refcountbt_set_root, .alloc_block = xfs_refcountbt_alloc_block, .free_block = xfs_refcountbt_free_block, .get_minrecs = xfs_refcountbt_get_minrecs, .get_maxrecs = xfs_refcountbt_get_maxrecs, .init_key_from_rec = xfs_refcountbt_init_key_from_rec, .init_high_key_from_rec = xfs_refcountbt_init_high_key_from_rec, .init_rec_from_cur = xfs_refcountbt_init_rec_from_cur, .init_ptr_from_cur = xfs_refcountbt_init_ptr_from_cur, .key_diff = xfs_refcountbt_key_diff, .buf_ops = &xfs_refcountbt_buf_ops, .diff_two_keys = xfs_refcountbt_diff_two_keys, .keys_inorder = xfs_refcountbt_keys_inorder, .recs_inorder = xfs_refcountbt_recs_inorder, .keys_contiguous = xfs_refcountbt_keys_contiguous, }; /* * Create a new refcount btree cursor. * * For staging cursors tp and agbp are NULL. */ struct xfs_btree_cur * xfs_refcountbt_init_cursor( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, struct xfs_perag *pag) { struct xfs_btree_cur *cur; ASSERT(pag_agno(pag) < mp->m_sb.sb_agcount); cur = xfs_btree_alloc_cursor(mp, tp, &xfs_refcountbt_ops, mp->m_refc_maxlevels, xfs_refcountbt_cur_cache); cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_refc.nr_ops = 0; cur->bc_refc.shape_changes = 0; cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agf *agf = agbp->b_addr; cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); } return cur; } /* * Swap in the new btree root. Once we pass this point the newly rebuilt btree * is in place and we have to kill off all the old btree blocks. */ void xfs_refcountbt_commit_staged_btree( struct xfs_btree_cur *cur, struct xfs_trans *tp, struct xfs_buf *agbp) { struct xfs_agf *agf = agbp->b_addr; struct xbtree_afakeroot *afake = cur->bc_ag.afake; ASSERT(cur->bc_flags & XFS_BTREE_STAGING); agf->agf_refcount_root = cpu_to_be32(afake->af_root); agf->agf_refcount_level = cpu_to_be32(afake->af_levels); agf->agf_refcount_blocks = cpu_to_be32(afake->af_blocks); xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS | XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL); xfs_btree_commit_afakeroot(cur, tp, agbp); } /* Calculate number of records in a refcount btree block. */ static inline unsigned int xfs_refcountbt_block_maxrecs( unsigned int blocklen, bool leaf) { if (leaf) return blocklen / sizeof(struct xfs_refcount_rec); return blocklen / (sizeof(struct xfs_refcount_key) + sizeof(xfs_refcount_ptr_t)); } /* * Calculate the number of records in a refcount btree block. */ unsigned int xfs_refcountbt_maxrecs( struct xfs_mount *mp, unsigned int blocklen, bool leaf) { blocklen -= XFS_REFCOUNT_BLOCK_LEN; return xfs_refcountbt_block_maxrecs(blocklen, leaf); } /* Compute the max possible height of the maximally sized refcount btree. */ unsigned int xfs_refcountbt_maxlevels_ondisk(void) { unsigned int minrecs[2]; unsigned int blocklen; blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN; minrecs[0] = xfs_refcountbt_block_maxrecs(blocklen, true) / 2; minrecs[1] = xfs_refcountbt_block_maxrecs(blocklen, false) / 2; return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_CRC_AG_BLOCKS); } /* Compute the maximum height of a refcount btree. */ void xfs_refcountbt_compute_maxlevels( struct xfs_mount *mp) { if (!xfs_has_reflink(mp)) { mp->m_refc_maxlevels = 0; return; } mp->m_refc_maxlevels = xfs_btree_compute_maxlevels( mp->m_refc_mnr, mp->m_sb.sb_agblocks); ASSERT(mp->m_refc_maxlevels <= xfs_refcountbt_maxlevels_ondisk()); } /* Calculate the refcount btree size for some records. */ xfs_extlen_t xfs_refcountbt_calc_size( struct xfs_mount *mp, unsigned long long len) { return xfs_btree_calc_size(mp->m_refc_mnr, len); } /* * Calculate the maximum refcount btree size. */ xfs_extlen_t xfs_refcountbt_max_size( struct xfs_mount *mp, xfs_agblock_t agblocks) { /* Bail out if we're uninitialized, which can happen in mkfs. */ if (mp->m_refc_mxr[0] == 0) return 0; return xfs_refcountbt_calc_size(mp, agblocks); } /* * Figure out how many blocks to reserve and how many are used by this btree. */ int xfs_refcountbt_calc_reserves( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used) { struct xfs_buf *agbp; struct xfs_agf *agf; xfs_agblock_t agblocks; xfs_extlen_t tree_len; int error; if (!xfs_has_reflink(mp)) return 0; error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) return error; agf = agbp->b_addr; agblocks = be32_to_cpu(agf->agf_length); tree_len = be32_to_cpu(agf->agf_refcount_blocks); xfs_trans_brelse(tp, agbp); /* * The log is permanently allocated, so the space it occupies will * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ if (xfs_ag_contains_log(mp, pag_agno(pag))) agblocks -= mp->m_sb.sb_logblocks; *ask += xfs_refcountbt_max_size(mp, agblocks); *used += tree_len; return error; } int __init xfs_refcountbt_init_cur_cache(void) { xfs_refcountbt_cur_cache = kmem_cache_create("xfs_refcbt_cur", xfs_btree_cur_sizeof(xfs_refcountbt_maxlevels_ondisk()), 0, 0, NULL); if (!xfs_refcountbt_cur_cache) return -ENOMEM; return 0; } void xfs_refcountbt_destroy_cur_cache(void) { kmem_cache_destroy(xfs_refcountbt_cur_cache); xfs_refcountbt_cur_cache = NULL; } |
| 69 17 17 86 70 66 1 37 2 2 4 4 4 1 60 1 1 4 6 9 42 1 2 103 103 66 56 39 1 5 41 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 | // SPDX-License-Identifier: GPL-2.0-or-later /* * User-space Probes (UProbes) for x86 * * Copyright (C) IBM Corporation, 2008-2011 * Authors: * Srikar Dronamraju * Jim Keniston */ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/ptrace.h> #include <linux/uprobes.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/kdebug.h> #include <asm/processor.h> #include <asm/insn.h> #include <asm/mmu_context.h> /* Post-execution fixups. */ /* Adjust IP back to vicinity of actual insn */ #define UPROBE_FIX_IP 0x01 /* Adjust the return address of a call insn */ #define UPROBE_FIX_CALL 0x02 /* Instruction will modify TF, don't change it */ #define UPROBE_FIX_SETF 0x04 #define UPROBE_FIX_RIP_SI 0x08 #define UPROBE_FIX_RIP_DI 0x10 #define UPROBE_FIX_RIP_BX 0x20 #define UPROBE_FIX_RIP_MASK \ (UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX) #define UPROBE_TRAP_NR UINT_MAX /* Adaptations for mhiramat x86 decoder v14. */ #define OPCODE1(insn) ((insn)->opcode.bytes[0]) #define OPCODE2(insn) ((insn)->opcode.bytes[1]) #define OPCODE3(insn) ((insn)->opcode.bytes[2]) #define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ << (row % 32)) /* * Good-instruction tables for 32-bit apps. This is non-const and volatile * to keep gcc from statically optimizing it out, as variable_test_bit makes * some versions of gcc to think only *(unsigned long*) is used. * * Opcodes we'll probably never support: * 6c-6f - ins,outs. SEGVs if used in userspace * e4-e7 - in,out imm. SEGVs if used in userspace * ec-ef - in,out acc. SEGVs if used in userspace * cc - int3. SIGTRAP if used in userspace * ce - into. Not used in userspace - no kernel support to make it useful. SEGVs * (why we support bound (62) then? it's similar, and similarly unused...) * f1 - int1. SIGTRAP if used in userspace * f4 - hlt. SEGVs if used in userspace * fa - cli. SEGVs if used in userspace * fb - sti. SEGVs if used in userspace * * Opcodes which need some work to be supported: * 07,17,1f - pop es/ss/ds * Normally not used in userspace, but would execute if used. * Can cause GP or stack exception if tries to load wrong segment descriptor. * We hesitate to run them under single step since kernel's handling * of userspace single-stepping (TF flag) is fragile. * We can easily refuse to support push es/cs/ss/ds (06/0e/16/1e) * on the same grounds that they are never used. * cd - int N. * Used by userspace for "int 80" syscall entry. (Other "int N" * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3). * Not supported since kernel's handling of userspace single-stepping * (TF flag) is fragile. * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad */ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static volatile u32 good_insns_32[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 00 */ W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */ W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ W(0x30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; #else #define good_insns_32 NULL #endif /* Good-instruction tables for 64-bit apps. * * Genuinely invalid opcodes: * 06,07 - formerly push/pop es * 0e - formerly push cs * 16,17 - formerly push/pop ss * 1e,1f - formerly push/pop ds * 27,2f,37,3f - formerly daa/das/aaa/aas * 60,61 - formerly pusha/popa * 62 - formerly bound. EVEX prefix for AVX512 (not yet supported) * 82 - formerly redundant encoding of Group1 * 9a - formerly call seg:ofs * ce - formerly into * d4,d5 - formerly aam/aad * d6 - formerly undocumented salc * ea - formerly jmp seg:ofs * * Opcodes we'll probably never support: * 6c-6f - ins,outs. SEGVs if used in userspace * e4-e7 - in,out imm. SEGVs if used in userspace * ec-ef - in,out acc. SEGVs if used in userspace * cc - int3. SIGTRAP if used in userspace * f1 - int1. SIGTRAP if used in userspace * f4 - hlt. SEGVs if used in userspace * fa - cli. SEGVs if used in userspace * fb - sti. SEGVs if used in userspace * * Opcodes which need some work to be supported: * cd - int N. * Used by userspace for "int 80" syscall entry. (Other "int N" * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3). * Not supported since kernel's handling of userspace single-stepping * (TF flag) is fragile. * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad */ #if defined(CONFIG_X86_64) static volatile u32 good_insns_64[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* 00 */ W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */ W(0x20, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 20 */ W(0x30, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ W(0x60, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1) , /* 90 */ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0) | /* e0 */ W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; #else #define good_insns_64 NULL #endif /* Using this for both 64-bit and 32-bit apps. * Opcodes we don't support: * 0f 00 - SLDT/STR/LLDT/LTR/VERR/VERW/-/- group. System insns * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group. * Also encodes tons of other system insns if mod=11. * Some are in fact non-system: xend, xtest, rdtscp, maybe more * 0f 05 - syscall * 0f 06 - clts (CPL0 insn) * 0f 07 - sysret * 0f 08 - invd (CPL0 insn) * 0f 09 - wbinvd (CPL0 insn) * 0f 0b - ud2 * 0f 30 - wrmsr (CPL0 insn) (then why rdmsr is allowed, it's also CPL0 insn?) * 0f 34 - sysenter * 0f 35 - sysexit * 0f 37 - getsec * 0f 78 - vmread (Intel VMX. CPL0 insn) * 0f 79 - vmwrite (Intel VMX. CPL0 insn) * Note: with prefixes, these two opcodes are * extrq/insertq/AVX512 convert vector ops. * 0f ae - group15: [f]xsave,[f]xrstor,[v]{ld,st}mxcsr,clflush[opt], * {rd,wr}{fs,gs}base,{s,l,m}fence. * Why? They are all user-executable. */ static volatile u32 good_2byte_insns[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1) | /* 00 */ W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ W(0x30, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; #undef W /* * opcodes we may need to refine support for: * * 0f - 2-byte instructions: For many of these instructions, the validity * depends on the prefix and/or the reg field. On such instructions, we * just consider the opcode combination valid if it corresponds to any * valid instruction. * * 8f - Group 1 - only reg = 0 is OK * c6-c7 - Group 11 - only reg = 0 is OK * d9-df - fpu insns with some illegal encodings * f2, f3 - repnz, repz prefixes. These are also the first byte for * certain floating-point instructions, such as addsd. * * fe - Group 4 - only reg = 0 or 1 is OK * ff - Group 5 - only reg = 0-6 is OK * * others -- Do we need to support these? * * 0f - (floating-point?) prefetch instructions * 07, 17, 1f - pop es, pop ss, pop ds * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes -- * but 64 and 65 (fs: and gs:) seem to be used, so we support them * 67 - addr16 prefix * ce - into * f0 - lock prefix */ /* * TODO: * - Where necessary, examine the modrm byte and allow only valid instructions * in the different Groups and fpu instructions. */ static bool is_prefix_bad(struct insn *insn) { insn_byte_t p; int i; for_each_insn_prefix(insn, i, p) { insn_attr_t attr; attr = inat_get_opcode_attribute(p); switch (attr) { case INAT_MAKE_PREFIX(INAT_PFX_ES): case INAT_MAKE_PREFIX(INAT_PFX_CS): case INAT_MAKE_PREFIX(INAT_PFX_DS): case INAT_MAKE_PREFIX(INAT_PFX_SS): case INAT_MAKE_PREFIX(INAT_PFX_LOCK): return true; } } return false; } static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64) { enum insn_mode m = x86_64 ? INSN_MODE_64 : INSN_MODE_32; u32 volatile *good_insns; int ret; ret = insn_decode(insn, auprobe->insn, sizeof(auprobe->insn), m); if (ret < 0) return -ENOEXEC; if (is_prefix_bad(insn)) return -ENOTSUPP; /* We should not singlestep on the exception masking instructions */ if (insn_masking_exception(insn)) return -ENOTSUPP; if (x86_64) good_insns = good_insns_64; else good_insns = good_insns_32; if (test_bit(OPCODE1(insn), (unsigned long *)good_insns)) return 0; if (insn->opcode.nbytes == 2) { if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) return 0; } return -ENOTSUPP; } #ifdef CONFIG_X86_64 asm ( ".pushsection .rodata\n" ".global uretprobe_trampoline_entry\n" "uretprobe_trampoline_entry:\n" "pushq %rax\n" "pushq %rcx\n" "pushq %r11\n" "movq $" __stringify(__NR_uretprobe) ", %rax\n" "syscall\n" ".global uretprobe_syscall_check\n" "uretprobe_syscall_check:\n" "popq %r11\n" "popq %rcx\n" /* The uretprobe syscall replaces stored %rax value with final * return address, so we don't restore %rax in here and just * call ret. */ "retq\n" ".global uretprobe_trampoline_end\n" "uretprobe_trampoline_end:\n" ".popsection\n" ); extern u8 uretprobe_trampoline_entry[]; extern u8 uretprobe_trampoline_end[]; extern u8 uretprobe_syscall_check[]; void *arch_uprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; struct pt_regs *regs = task_pt_regs(current); /* * At the moment the uretprobe syscall trampoline is supported * only for native 64-bit process, the compat process still uses * standard breakpoint. */ if (user_64bit_mode(regs)) { *psize = uretprobe_trampoline_end - uretprobe_trampoline_entry; return uretprobe_trampoline_entry; } *psize = UPROBE_SWBP_INSN_SIZE; return &insn; } static unsigned long trampoline_check_ip(unsigned long tramp) { return tramp + (uretprobe_syscall_check - uretprobe_trampoline_entry); } SYSCALL_DEFINE0(uretprobe) { struct pt_regs *regs = task_pt_regs(current); unsigned long err, ip, sp, r11_cx_ax[3], tramp; /* If there's no trampoline, we are called from wrong place. */ tramp = uprobe_get_trampoline_vaddr(); if (unlikely(tramp == UPROBE_NO_TRAMPOLINE_VADDR)) goto sigill; /* Make sure the ip matches the only allowed sys_uretprobe caller. */ if (unlikely(regs->ip != trampoline_check_ip(tramp))) goto sigill; err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax)); if (err) goto sigill; /* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */ regs->r11 = r11_cx_ax[0]; regs->cx = r11_cx_ax[1]; regs->ax = r11_cx_ax[2]; regs->sp += sizeof(r11_cx_ax); regs->orig_ax = -1; ip = regs->ip; sp = regs->sp; uprobe_handle_trampoline(regs); /* * Some of the uprobe consumers has changed sp, we can do nothing, * just return via iret. * .. or shadow stack is enabled, in which case we need to skip * return through the user space stack address. */ if (regs->sp != sp || shstk_is_enabled()) return regs->ax; regs->sp -= sizeof(r11_cx_ax); /* for the case uprobe_consumer has changed r11/cx */ r11_cx_ax[0] = regs->r11; r11_cx_ax[1] = regs->cx; /* * ax register is passed through as return value, so we can use * its space on stack for ip value and jump to it through the * trampoline's ret instruction */ r11_cx_ax[2] = regs->ip; regs->ip = ip; err = copy_to_user((void __user *)regs->sp, r11_cx_ax, sizeof(r11_cx_ax)); if (err) goto sigill; /* ensure sysret, see do_syscall_64() */ regs->r11 = regs->flags; regs->cx = regs->ip; return regs->ax; sigill: force_sig(SIGILL); return -1; } /* * If arch_uprobe->insn doesn't use rip-relative addressing, return * immediately. Otherwise, rewrite the instruction so that it accesses * its memory operand indirectly through a scratch register. Set * defparam->fixups accordingly. (The contents of the scratch register * will be saved before we single-step the modified instruction, * and restored afterward). * * We do this because a rip-relative instruction can access only a * relatively small area (+/- 2 GB from the instruction), and the XOL * area typically lies beyond that area. At least for instructions * that store to memory, we can't execute the original instruction * and "fix things up" later, because the misdirected store could be * disastrous. * * Some useful facts about rip-relative instructions: * * - There's always a modrm byte with bit layout "00 reg 101". * - There's never a SIB byte. * - The displacement is always 4 bytes. * - REX.B=1 bit in REX prefix, which normally extends r/m field, * has no effect on rip-relative mode. It doesn't make modrm byte * with r/m=101 refer to register 1101 = R13. */ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) { u8 *cursor; u8 reg; u8 reg2; if (!insn_rip_relative(insn)) return; /* * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm. * Clear REX.b bit (extension of MODRM.rm field): * we want to encode low numbered reg, not r8+. */ if (insn->rex_prefix.nbytes) { cursor = auprobe->insn + insn_offset_rex_prefix(insn); /* REX byte has 0100wrxb layout, clearing REX.b bit */ *cursor &= 0xfe; } /* * Similar treatment for VEX3/EVEX prefix. * TODO: add XOP treatment when insn decoder supports them */ if (insn->vex_prefix.nbytes >= 3) { /* * vex2: c5 rvvvvLpp (has no b bit) * vex3/xop: c4/8f rxbmmmmm wvvvvLpp * evex: 62 rxbR00mm wvvvv1pp zllBVaaa * Setting VEX3.b (setting because it has inverted meaning). * Setting EVEX.x since (in non-SIB encoding) EVEX.x * is the 4th bit of MODRM.rm, and needs the same treatment. * For VEX3-encoded insns, VEX3.x value has no effect in * non-SIB encoding, the change is superfluous but harmless. */ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; *cursor |= 0x60; } /* * Convert from rip-relative addressing to register-relative addressing * via a scratch register. * * This is tricky since there are insns with modrm byte * which also use registers not encoded in modrm byte: * [i]div/[i]mul: implicitly use dx:ax * shift ops: implicitly use cx * cmpxchg: implicitly uses ax * cmpxchg8/16b: implicitly uses dx:ax and bx:cx * Encoding: 0f c7/1 modrm * The code below thinks that reg=1 (cx), chooses si as scratch. * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m. * First appeared in Haswell (BMI2 insn). It is vex-encoded. * Example where none of bx,cx,dx can be used as scratch reg: * c4 e2 63 f6 0d disp32 mulx disp32(%rip),%ebx,%ecx * [v]pcmpistri: implicitly uses cx, xmm0 * [v]pcmpistrm: implicitly uses xmm0 * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0 * [v]pcmpestrm: implicitly uses ax, dx, xmm0 * Evil SSE4.2 string comparison ops from hell. * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination. * Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm. * Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi). * AMD says it has no 3-operand form (vex.vvvv must be 1111) * and that it can have only register operands, not mem * (its modrm byte must have mode=11). * If these restrictions will ever be lifted, * we'll need code to prevent selection of di as scratch reg! * * Summary: I don't know any insns with modrm byte which * use SI register implicitly. DI register is used only * by one insn (maskmovq) and BX register is used * only by one too (cmpxchg8b). * BP is stack-segment based (may be a problem?). * AX, DX, CX are off-limits (many implicit users). * SP is unusable (it's stack pointer - think about "pop mem"; * also, rsp+disp32 needs sib encoding -> insn length change). */ reg = MODRM_REG(insn); /* Fetch modrm.reg */ reg2 = 0xff; /* Fetch vex.vvvv */ if (insn->vex_prefix.nbytes) reg2 = insn->vex_prefix.bytes[2]; /* * TODO: add XOP vvvv reading. * * vex.vvvv field is in bits 6-3, bits are inverted. * But in 32-bit mode, high-order bit may be ignored. * Therefore, let's consider only 3 low-order bits. */ reg2 = ((reg2 >> 3) & 0x7) ^ 0x7; /* * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15. * * Choose scratch reg. Order is important: must not select bx * if we can use si (cmpxchg8b case!) */ if (reg != 6 && reg2 != 6) { reg2 = 6; auprobe->defparam.fixups |= UPROBE_FIX_RIP_SI; } else if (reg != 7 && reg2 != 7) { reg2 = 7; auprobe->defparam.fixups |= UPROBE_FIX_RIP_DI; /* TODO (paranoia): force maskmovq to not use di */ } else { reg2 = 3; auprobe->defparam.fixups |= UPROBE_FIX_RIP_BX; } /* * Point cursor at the modrm byte. The next 4 bytes are the * displacement. Beyond the displacement, for some instructions, * is the immediate operand. */ cursor = auprobe->insn + insn_offset_modrm(insn); /* * Change modrm from "00 reg 101" to "10 reg reg2". Example: * 89 05 disp32 mov %eax,disp32(%rip) becomes * 89 86 disp32 mov %eax,disp32(%rsi) */ *cursor = 0x80 | (reg << 3) | reg2; } static inline unsigned long * scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs) { if (auprobe->defparam.fixups & UPROBE_FIX_RIP_SI) return ®s->si; if (auprobe->defparam.fixups & UPROBE_FIX_RIP_DI) return ®s->di; return ®s->bx; } /* * If we're emulating a rip-relative instruction, save the contents * of the scratch register and store the target address in that register. */ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) { struct uprobe_task *utask = current->utask; unsigned long *sr = scratch_reg(auprobe, regs); utask->autask.saved_scratch_register = *sr; *sr = utask->vaddr + auprobe->defparam.ilen; } } static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) { struct uprobe_task *utask = current->utask; unsigned long *sr = scratch_reg(auprobe, regs); *sr = utask->autask.saved_scratch_register; } } #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit */ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) { } static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } #endif /* CONFIG_X86_64 */ struct uprobe_xol_ops { bool (*emulate)(struct arch_uprobe *, struct pt_regs *); int (*pre_xol)(struct arch_uprobe *, struct pt_regs *); int (*post_xol)(struct arch_uprobe *, struct pt_regs *); void (*abort)(struct arch_uprobe *, struct pt_regs *); }; static inline int sizeof_long(struct pt_regs *regs) { /* * Check registers for mode as in_xxx_syscall() does not apply here. */ return user_64bit_mode(regs) ? 8 : 4; } static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { riprel_pre_xol(auprobe, regs); return 0; } static int emulate_push_stack(struct pt_regs *regs, unsigned long val) { unsigned long new_sp = regs->sp - sizeof_long(regs); if (copy_to_user((void __user *)new_sp, &val, sizeof_long(regs))) return -EFAULT; regs->sp = new_sp; return 0; } /* * We have to fix things up as follows: * * Typically, the new ip is relative to the copied instruction. We need * to make it relative to the original instruction (FIX_IP). Exceptions * are return instructions and absolute or indirect jump or call instructions. * * If the single-stepped instruction was a call, the return address that * is atop the stack is the address following the copied instruction. We * need to make it the address following the original instruction (FIX_CALL). * * If the original instruction was a rip-relative instruction such as * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)". * We need to restore the contents of the scratch register * (FIX_RIP_reg). */ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; riprel_post_xol(auprobe, regs); if (auprobe->defparam.fixups & UPROBE_FIX_IP) { long correction = utask->vaddr - utask->xol_vaddr; regs->ip += correction; } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) { regs->sp += sizeof_long(regs); /* Pop incorrect return address */ if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen)) return -ERESTART; } /* popf; tell the caller to not touch TF */ if (auprobe->defparam.fixups & UPROBE_FIX_SETF) utask->autask.saved_tf = true; return 0; } static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { riprel_post_xol(auprobe, regs); } static const struct uprobe_xol_ops default_xol_ops = { .pre_xol = default_pre_xol_op, .post_xol = default_post_xol_op, .abort = default_abort_op, }; static bool branch_is_call(struct arch_uprobe *auprobe) { return auprobe->branch.opc1 == 0xe8; } #define CASE_COND \ COND(70, 71, XF(OF)) \ COND(72, 73, XF(CF)) \ COND(74, 75, XF(ZF)) \ COND(78, 79, XF(SF)) \ COND(7a, 7b, XF(PF)) \ COND(76, 77, XF(CF) || XF(ZF)) \ COND(7c, 7d, XF(SF) != XF(OF)) \ COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF)) #define COND(op_y, op_n, expr) \ case 0x ## op_y: DO((expr) != 0) \ case 0x ## op_n: DO((expr) == 0) #define XF(xf) (!!(flags & X86_EFLAGS_ ## xf)) static bool is_cond_jmp_opcode(u8 opcode) { switch (opcode) { #define DO(expr) \ return true; CASE_COND #undef DO default: return false; } } static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs) { unsigned long flags = regs->flags; switch (auprobe->branch.opc1) { #define DO(expr) \ return expr; CASE_COND #undef DO default: /* not a conditional jmp */ return true; } } #undef XF #undef COND #undef CASE_COND static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { unsigned long new_ip = regs->ip += auprobe->branch.ilen; unsigned long offs = (long)auprobe->branch.offs; if (branch_is_call(auprobe)) { /* * If it fails we execute this (mangled, see the comment in * branch_clear_offset) insn out-of-line. In the likely case * this should trigger the trap, and the probed application * should die or restart the same insn after it handles the * signal, arch_uprobe_post_xol() won't be even called. * * But there is corner case, see the comment in ->post_xol(). */ if (emulate_push_stack(regs, new_ip)) return false; } else if (!check_jmp_cond(auprobe, regs)) { offs = 0; } regs->ip = new_ip + offs; return true; } static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset; if (emulate_push_stack(regs, *src_ptr)) return false; regs->ip += auprobe->push.ilen; return true; } static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { BUG_ON(!branch_is_call(auprobe)); /* * We can only get here if branch_emulate_op() failed to push the ret * address _and_ another thread expanded our stack before the (mangled) * "call" insn was executed out-of-line. Just restore ->sp and restart. * We could also restore ->ip and try to call branch_emulate_op() again. */ regs->sp += sizeof_long(regs); return -ERESTART; } static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn) { /* * Turn this insn into "call 1f; 1:", this is what we will execute * out-of-line if ->emulate() fails. We only need this to generate * a trap, so that the probed task receives the correct signal with * the properly filled siginfo. * * But see the comment in ->post_xol(), in the unlikely case it can * succeed. So we need to ensure that the new ->ip can not fall into * the non-canonical area and trigger #GP. * * We could turn it into (say) "pushf", but then we would need to * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte * of ->insn[] for set_orig_insn(). */ memset(auprobe->insn + insn_offset_immediate(insn), 0, insn->immediate.nbytes); } static const struct uprobe_xol_ops branch_xol_ops = { .emulate = branch_emulate_op, .post_xol = branch_post_xol_op, }; static const struct uprobe_xol_ops push_xol_ops = { .emulate = push_emulate_op, }; /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) { u8 opc1 = OPCODE1(insn); insn_byte_t p; int i; switch (opc1) { case 0xeb: /* jmp 8 */ case 0xe9: /* jmp 32 */ break; case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ goto setup; case 0xe8: /* call relative */ branch_clear_offset(auprobe, insn); break; case 0x0f: if (insn->opcode.nbytes != 2) return -ENOSYS; /* * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches * OPCODE1() of the "short" jmp which checks the same condition. */ opc1 = OPCODE2(insn) - 0x10; fallthrough; default: if (!is_cond_jmp_opcode(opc1)) return -ENOSYS; } /* * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported. * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix. * No one uses these insns, reject any branch insns with such prefix. */ for_each_insn_prefix(insn, i, p) { if (p == 0x66) return -ENOTSUPP; } setup: auprobe->branch.opc1 = opc1; auprobe->branch.ilen = insn->length; auprobe->branch.offs = insn->immediate.value; auprobe->ops = &branch_xol_ops; return 0; } /* Returns -ENOSYS if push_xol_ops doesn't handle this insn */ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) { u8 opc1 = OPCODE1(insn), reg_offset = 0; if (opc1 < 0x50 || opc1 > 0x57) return -ENOSYS; if (insn->length > 2) return -ENOSYS; if (insn->length == 2) { /* only support rex_prefix 0x41 (x64 only) */ #ifdef CONFIG_X86_64 if (insn->rex_prefix.nbytes != 1 || insn->rex_prefix.bytes[0] != 0x41) return -ENOSYS; switch (opc1) { case 0x50: reg_offset = offsetof(struct pt_regs, r8); break; case 0x51: reg_offset = offsetof(struct pt_regs, r9); break; case 0x52: reg_offset = offsetof(struct pt_regs, r10); break; case 0x53: reg_offset = offsetof(struct pt_regs, r11); break; case 0x54: reg_offset = offsetof(struct pt_regs, r12); break; case 0x55: reg_offset = offsetof(struct pt_regs, r13); break; case 0x56: reg_offset = offsetof(struct pt_regs, r14); break; case 0x57: reg_offset = offsetof(struct pt_regs, r15); break; } #else return -ENOSYS; #endif } else { switch (opc1) { case 0x50: reg_offset = offsetof(struct pt_regs, ax); break; case 0x51: reg_offset = offsetof(struct pt_regs, cx); break; case 0x52: reg_offset = offsetof(struct pt_regs, dx); break; case 0x53: reg_offset = offsetof(struct pt_regs, bx); break; case 0x54: reg_offset = offsetof(struct pt_regs, sp); break; case 0x55: reg_offset = offsetof(struct pt_regs, bp); break; case 0x56: reg_offset = offsetof(struct pt_regs, si); break; case 0x57: reg_offset = offsetof(struct pt_regs, di); break; } } auprobe->push.reg_offset = reg_offset; auprobe->push.ilen = insn->length; auprobe->ops = &push_xol_ops; return 0; } /** * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. * @auprobe: the probepoint information. * @mm: the probed address space. * @addr: virtual address at which to install the probepoint * Return 0 on success or a -ve number on error. */ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { struct insn insn; u8 fix_ip_or_call = UPROBE_FIX_IP; int ret; ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm)); if (ret) return ret; ret = branch_setup_xol_ops(auprobe, &insn); if (ret != -ENOSYS) return ret; ret = push_setup_xol_ops(auprobe, &insn); if (ret != -ENOSYS) return ret; /* * Figure out which fixups default_post_xol_op() will need to perform, * and annotate defparam->fixups accordingly. */ switch (OPCODE1(&insn)) { case 0x9d: /* popf */ auprobe->defparam.fixups |= UPROBE_FIX_SETF; break; case 0xc3: /* ret or lret -- ip is correct */ case 0xcb: case 0xc2: case 0xca: case 0xea: /* jmp absolute -- ip is correct */ fix_ip_or_call = 0; break; case 0x9a: /* call absolute - Fix return addr, not ip */ fix_ip_or_call = UPROBE_FIX_CALL; break; case 0xff: switch (MODRM_REG(&insn)) { case 2: case 3: /* call or lcall, indirect */ fix_ip_or_call = UPROBE_FIX_CALL; break; case 4: case 5: /* jmp or ljmp, indirect */ fix_ip_or_call = 0; break; } fallthrough; default: riprel_analyze(auprobe, &insn); } auprobe->defparam.ilen = insn.length; auprobe->defparam.fixups |= fix_ip_or_call; auprobe->ops = &default_xol_ops; return 0; } /* * arch_uprobe_pre_xol - prepare to execute out of line. * @auprobe: the probepoint information. * @regs: reflects the saved user state of current task. */ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (auprobe->ops->pre_xol) { int err = auprobe->ops->pre_xol(auprobe, regs); if (err) return err; } regs->ip = utask->xol_vaddr; utask->autask.saved_trap_nr = current->thread.trap_nr; current->thread.trap_nr = UPROBE_TRAP_NR; utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF); regs->flags |= X86_EFLAGS_TF; if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) set_task_blockstep(current, false); return 0; } /* * If xol insn itself traps and generates a signal(Say, * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped * instruction jumps back to its own address. It is assumed that anything * like do_page_fault/do_trap/etc sets thread.trap_nr != -1. * * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr, * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol(). */ bool arch_uprobe_xol_was_trapped(struct task_struct *t) { if (t->thread.trap_nr != UPROBE_TRAP_NR) return true; return false; } /* * Called after single-stepping. To avoid the SMP problems that can * occur when we temporarily put back the original opcode to * single-step, we single-stepped a copy of the instruction. * * This function prepares to resume execution after the single-step. */ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; bool send_sigtrap = utask->autask.saved_tf; int err = 0; WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); current->thread.trap_nr = utask->autask.saved_trap_nr; if (auprobe->ops->post_xol) { err = auprobe->ops->post_xol(auprobe, regs); if (err) { /* * Restore ->ip for restart or post mortem analysis. * ->post_xol() must not return -ERESTART unless this * is really possible. */ regs->ip = utask->vaddr; if (err == -ERESTART) err = 0; send_sigtrap = false; } } /* * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP * so we can get an extra SIGTRAP if we do not clear TF. We need * to examine the opcode to make it right. */ if (send_sigtrap) send_sig(SIGTRAP, current, 0); if (!utask->autask.saved_tf) regs->flags &= ~X86_EFLAGS_TF; return err; } /* callback routine for handling exceptions. */ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data) { struct die_args *args = data; struct pt_regs *regs = args->regs; int ret = NOTIFY_DONE; /* We are only interested in userspace traps */ if (regs && !user_mode(regs)) return NOTIFY_DONE; switch (val) { case DIE_INT3: if (uprobe_pre_sstep_notifier(regs)) ret = NOTIFY_STOP; break; case DIE_DEBUG: if (uprobe_post_sstep_notifier(regs)) ret = NOTIFY_STOP; break; default: break; } return ret; } /* * This function gets called when XOL instruction either gets trapped or * the thread has a fatal signal. Reset the instruction pointer to its * probed address for the potential restart or for post mortem analysis. */ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (auprobe->ops->abort) auprobe->ops->abort(auprobe, regs); current->thread.trap_nr = utask->autask.saved_trap_nr; regs->ip = utask->vaddr; /* clear TF if it was set by us in arch_uprobe_pre_xol() */ if (!utask->autask.saved_tf) regs->flags &= ~X86_EFLAGS_TF; } static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { if (auprobe->ops->emulate) return auprobe->ops->emulate(auprobe, regs); return false; } bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { bool ret = __skip_sstep(auprobe, regs); if (ret && (regs->flags & X86_EFLAGS_TF)) send_sig(SIGTRAP, current, 0); return ret; } unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { int rasize = sizeof_long(regs), nleft; unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */ if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize)) return -1; /* check whether address has been already hijacked */ if (orig_ret_vaddr == trampoline_vaddr) return orig_ret_vaddr; nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); if (likely(!nleft)) { if (shstk_update_last_frame(trampoline_vaddr)) { force_sig(SIGSEGV); return -1; } return orig_ret_vaddr; } if (nleft != rasize) { pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n", current->pid, regs->sp, regs->ip); force_sig(SIGSEGV); } return -1; } bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, struct pt_regs *regs) { if (ctx == RP_CHECK_CALL) /* sp was just decremented by "call" insn */ return regs->sp < ret->stack; else return regs->sp <= ret->stack; } |
| 3 3 1 2 2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 | // SPDX-License-Identifier: GPL-2.0-only /* * IPV6 GSO/GRO offload support * Linux INET implementation * * Copyright (C) 2016 secunet Security Networks AG * Author: Steffen Klassert <steffen.klassert@secunet.com> * * ESP GRO support */ #include <linux/skbuff.h> #include <linux/init.h> #include <net/protocol.h> #include <crypto/aead.h> #include <crypto/authenc.h> #include <linux/err.h> #include <linux/module.h> #include <net/gro.h> #include <net/gso.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/esp.h> #include <linux/scatterlist.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/ip6_route.h> #include <net/ipv6.h> #include <linux/icmpv6.h> static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen) { int off = sizeof(struct ipv6hdr); struct ipv6_opt_hdr *exthdr; /* ESP or ESPINUDP */ if (likely(ipv6_hdr->nexthdr == NEXTHDR_ESP || ipv6_hdr->nexthdr == NEXTHDR_UDP)) return offsetof(struct ipv6hdr, nexthdr); while (off < nhlen) { exthdr = (void *)ipv6_hdr + off; if (exthdr->nexthdr == NEXTHDR_ESP) return off; off += ipv6_optlen(exthdr); } return 0; } static struct sk_buff *esp6_gro_receive(struct list_head *head, struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; struct xfrm_state *x; int encap_type = 0; __be32 seq; __be32 spi; int nhoff; if (NAPI_GRO_CB(skb)->proto == IPPROTO_UDP) encap_type = UDP_ENCAP_ESPINUDP; if (!pskb_pull(skb, offset)) return NULL; if (xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq) != 0) goto out; xo = xfrm_offload(skb); if (!xo || !(xo->flags & CRYPTO_DONE)) { struct sec_path *sp = secpath_set(skb); if (!sp) goto out; if (sp->len == XFRM_MAX_DEPTH) goto out_reset; x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark, (xfrm_address_t *)&ipv6_hdr(skb)->daddr, spi, IPPROTO_ESP, AF_INET6); if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { /* non-offload path will record the error and audit log */ xfrm_state_put(x); x = NULL; } if (!x) goto out_reset; skb->mark = xfrm_smark_get(skb->mark, x); sp->xvec[sp->len++] = x; sp->olen++; xo = xfrm_offload(skb); if (!xo) goto out_reset; } xo->flags |= XFRM_GRO; nhoff = esp6_nexthdr_esp_offset(ipv6_hdr(skb), offset); if (!nhoff) goto out; IP6CB(skb)->nhoff = nhoff; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); XFRM_SPI_SKB_CB(skb)->seq = seq; /* We don't need to handle errors from xfrm_input, it does all * the error handling and frees the resources on error. */ xfrm_input(skb, IPPROTO_ESP, spi, encap_type); return ERR_PTR(-EINPROGRESS); out_reset: secpath_reset(skb); out: skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 1; return NULL; } static void esp6_gso_encap(struct xfrm_state *x, struct sk_buff *skb) { struct ip_esp_hdr *esph; struct ipv6hdr *iph = ipv6_hdr(skb); struct xfrm_offload *xo = xfrm_offload(skb); u8 proto = iph->nexthdr; skb_push(skb, -skb_network_offset(skb)); if (x->outer_mode.encap == XFRM_MODE_TRANSPORT) { __be16 frag; ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto, &frag); } esph = ip_esp_hdr(skb); *skb_mac_header(skb) = IPPROTO_ESP; esph->spi = x->id.spi; esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); xo->proto = proto; } static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { __be16 type = x->inner_mode.family == AF_INET ? htons(ETH_P_IP) : htons(ETH_P_IPV6); return skb_eth_gso_segment(skb, features, type); } static struct sk_buff *xfrm6_transport_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { const struct net_offload *ops; struct sk_buff *segs = ERR_PTR(-EINVAL); struct xfrm_offload *xo = xfrm_offload(skb); skb->transport_header += x->props.header_len; ops = rcu_dereference(inet6_offloads[xo->proto]); if (likely(ops && ops->callbacks.gso_segment)) segs = ops->callbacks.gso_segment(skb, features); return segs; } static struct sk_buff *xfrm6_beet_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { struct xfrm_offload *xo = xfrm_offload(skb); struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; u8 proto = xo->proto; skb->transport_header += x->props.header_len; if (x->sel.family != AF_INET6) { skb->transport_header -= (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); if (proto == IPPROTO_BEETPH) { struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data; skb->transport_header += ph->hdrlen * 8; proto = ph->nexthdr; } else { skb->transport_header -= IPV4_BEET_PHMAXLEN; } if (proto == IPPROTO_TCP) skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; } else { __be16 frag; skb->transport_header += ipv6_skip_exthdr(skb, 0, &proto, &frag); } if (proto == IPPROTO_IPIP) skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6; __skb_pull(skb, skb_transport_offset(skb)); ops = rcu_dereference(inet6_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) segs = ops->callbacks.gso_segment(skb, features); return segs; } static struct sk_buff *xfrm6_outer_mode_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { switch (x->outer_mode.encap) { case XFRM_MODE_TUNNEL: return xfrm6_tunnel_gso_segment(x, skb, features); case XFRM_MODE_TRANSPORT: return xfrm6_transport_gso_segment(x, skb, features); case XFRM_MODE_BEET: return xfrm6_beet_gso_segment(x, skb, features); } return ERR_PTR(-EOPNOTSUPP); } static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct xfrm_state *x; struct ip_esp_hdr *esph; struct crypto_aead *aead; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); struct sec_path *sp; if (!xo) return ERR_PTR(-EINVAL); if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) return ERR_PTR(-EINVAL); sp = skb_sec_path(skb); x = sp->xvec[sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); if (esph->spi != x->id.spi) return ERR_PTR(-EINVAL); if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) return ERR_PTR(-EINVAL); __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); skb->encap_hdr_csum = 1; if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC); else if (!(features & NETIF_F_HW_ESP_TX_CSUM)) esp_features = features & ~(NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC); xo->flags |= XFRM_GSO_SEGMENT; return xfrm6_outer_mode_gso_segment(x, skb, esp_features); } static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb) { struct crypto_aead *aead = x->data; struct xfrm_offload *xo = xfrm_offload(skb); if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead))) return -EINVAL; if (!(xo->flags & CRYPTO_DONE)) skb->ip_summed = CHECKSUM_NONE; return esp6_input_done2(skb, 0); } static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { int len; int err; int alen; int blksize; struct xfrm_offload *xo; struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; __u32 seq; esp.inplace = true; xo = xfrm_offload(skb); if (!xo) return -EINVAL; if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) { xo->flags |= CRYPTO_FALLBACK; hw_offload = false; } esp.proto = xo->proto; /* skb is pure payload to encrypt */ aead = x->data; alen = crypto_aead_authsize(aead); esp.tfclen = 0; /* XXX: Add support for tfc padding here. */ blksize = ALIGN(crypto_aead_blocksize(aead), 4); esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize); esp.plen = esp.clen - skb->len - esp.tfclen; esp.tailen = esp.tfclen + esp.plen + alen; if (!hw_offload || !skb_is_gso(skb)) { esp.nfrags = esp6_output_head(x, skb, &esp); if (esp.nfrags < 0) return esp.nfrags; } seq = xo->seq.low; esp.esph = ip_esp_hdr(skb); esp.esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { esp.esph->seq_no = htonl(seq); if (!skb_is_gso(skb)) xo->seq.low++; else xo->seq.low += skb_shinfo(skb)->gso_segs; } if (xo->seq.low < seq) xo->seq.hi++; esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); len = skb->len - sizeof(struct ipv6hdr); if (len > IPV6_MAXPLEN) len = 0; ipv6_hdr(skb)->payload_len = htons(len); if (hw_offload) { if (!skb_ext_add(skb, SKB_EXT_SEC_PATH)) return -ENOMEM; xo = xfrm_offload(skb); if (!xo) return -EINVAL; xo->flags |= XFRM_XMIT; return 0; } err = esp6_output_tail(x, skb, &esp); if (err) return err; secpath_reset(skb); if (skb_needs_linearize(skb, skb->dev->features) && __skb_linearize(skb)) return -ENOMEM; return 0; } static const struct net_offload esp6_offload = { .callbacks = { .gro_receive = esp6_gro_receive, .gso_segment = esp6_gso_segment, }, }; static const struct xfrm_type_offload esp6_type_offload = { .owner = THIS_MODULE, .proto = IPPROTO_ESP, .input_tail = esp6_input_tail, .xmit = esp6_xmit, .encap = esp6_gso_encap, }; static int __init esp6_offload_init(void) { if (xfrm_register_type_offload(&esp6_type_offload, AF_INET6) < 0) { pr_info("%s: can't add xfrm type offload\n", __func__); return -EAGAIN; } return inet6_add_offload(&esp6_offload, IPPROTO_ESP); } static void __exit esp6_offload_exit(void) { xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6); inet6_del_offload(&esp6_offload, IPPROTO_ESP); } module_init(esp6_offload_init); module_exit(esp6_offload_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET6, XFRM_PROTO_ESP); MODULE_DESCRIPTION("IPV6 GSO/GRO offload support"); |
| 16 6 27 17 27 27 27 27 11 15 38 6 24 84 25 39 1 1 1 58 2 3 1 155 15 9 122 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_PKT_CLS_H #define __NET_PKT_CLS_H #include <linux/pkt_cls.h> #include <linux/workqueue.h> #include <net/sch_generic.h> #include <net/act_api.h> #include <net/net_namespace.h> /* TC action not accessible from user space */ #define TC_ACT_CONSUMED (TC_ACT_VALUE_MAX + 1) /* Basic packet classifier frontend definitions. */ struct tcf_walker { int stop; int skip; int count; bool nonempty; unsigned long cookie; int (*fn)(struct tcf_proto *, void *node, struct tcf_walker *); }; int register_tcf_proto_ops(struct tcf_proto_ops *ops); void unregister_tcf_proto_ops(struct tcf_proto_ops *ops); #define NET_CLS_ALIAS_PREFIX "net-cls-" #define MODULE_ALIAS_NET_CLS(kind) MODULE_ALIAS(NET_CLS_ALIAS_PREFIX kind) struct tcf_block_ext_info { enum flow_block_binder_type binder_type; tcf_chain_head_change_t *chain_head_change; void *chain_head_change_priv; u32 block_index; }; struct tcf_qevent { struct tcf_block *block; struct tcf_block_ext_info info; struct tcf_proto __rcu *filter_chain; }; struct tcf_block_cb; bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index); void tcf_chain_put_by_act(struct tcf_chain *chain); struct tcf_chain *tcf_get_next_chain(struct tcf_block *block, struct tcf_chain *chain); struct tcf_proto *tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp); void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, struct netlink_ext_ack *extack); int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack); void tcf_block_put(struct tcf_block *block); void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei); int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action, int police, struct tcf_proto *tp, u32 handle, bool used_action_miss); static inline bool tcf_block_shared(struct tcf_block *block) { return block->index; } static inline bool tcf_block_non_null_shared(struct tcf_block *block) { return block && block->index; } #ifdef CONFIG_NET_CLS_ACT DECLARE_STATIC_KEY_FALSE(tcf_sw_enabled_key); static inline bool tcf_block_bypass_sw(struct tcf_block *block) { return block && !atomic_read(&block->useswcnt); } #endif static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { WARN_ON(tcf_block_shared(block)); return block->q; } int tcf_classify(struct sk_buff *skb, const struct tcf_block *block, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); static inline bool tc_cls_stats_dump(struct tcf_proto *tp, struct tcf_walker *arg, void *filter) { if (arg->count >= arg->skip && arg->fn(tp, filter, arg) < 0) { arg->stop = 1; return false; } arg->count++; return true; } #else static inline bool tcf_block_shared(struct tcf_block *block) { return false; } static inline bool tcf_block_non_null_shared(struct tcf_block *block) { return false; } static inline int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, struct netlink_ext_ack *extack) { return 0; } static inline int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack) { return 0; } static inline void tcf_block_put(struct tcf_block *block) { } static inline void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { } static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { return NULL; } static inline int tcf_classify(struct sk_buff *skb, const struct tcf_block *block, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { return TC_ACT_UNSPEC; } #endif static inline unsigned long __cls_set_class(unsigned long *clp, unsigned long cl) { return xchg(clp, cl); } static inline void __tcf_bind_filter(struct Qdisc *q, struct tcf_result *r, unsigned long base) { unsigned long cl; cl = q->ops->cl_ops->bind_tcf(q, base, r->classid); cl = __cls_set_class(&r->class, cl); if (cl) q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base) { struct Qdisc *q = tp->chain->block->q; /* Check q as it is not set for shared blocks. In that case, * setting class is not supported. */ if (!q) return; sch_tree_lock(q); __tcf_bind_filter(q, r, base); sch_tree_unlock(q); } static inline void __tcf_unbind_filter(struct Qdisc *q, struct tcf_result *r) { unsigned long cl; if ((cl = __cls_set_class(&r->class, 0)) != 0) q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r) { struct Qdisc *q = tp->chain->block->q; if (!q) return; __tcf_unbind_filter(q, r); } static inline void tc_cls_bind_class(u32 classid, unsigned long cl, void *q, struct tcf_result *res, unsigned long base) { if (res->classid == classid) { if (cl) __tcf_bind_filter(q, res, base); else __tcf_unbind_filter(q, res); } } struct tcf_exts { #ifdef CONFIG_NET_CLS_ACT __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ int nr_actions; struct tc_action **actions; struct net *net; netns_tracker ns_tracker; struct tcf_exts_miss_cookie_node *miss_cookie_node; #endif /* Map to export classifier specific extension TLV types to the * generic extensions API. Unsupported extensions must be set to 0. */ int action; int police; }; static inline int tcf_exts_init(struct tcf_exts *exts, struct net *net, int action, int police) { #ifdef CONFIG_NET_CLS return tcf_exts_init_ex(exts, net, action, police, NULL, 0, false); #else return -EOPNOTSUPP; #endif } /* Return false if the netns is being destroyed in cleanup_net(). Callers * need to do cleanup synchronously in this case, otherwise may race with * tc_action_net_exit(). Return true for other cases. */ static inline bool tcf_exts_get_net(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT exts->net = maybe_get_net(exts->net); if (exts->net) netns_tracker_alloc(exts->net, &exts->ns_tracker, GFP_KERNEL); return exts->net != NULL; #else return true; #endif } static inline void tcf_exts_put_net(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT if (exts->net) put_net_track(exts->net, &exts->ns_tracker); #endif } #ifdef CONFIG_NET_CLS_ACT #define tcf_exts_for_each_action(i, a, exts) \ for (i = 0; i < TCA_ACT_MAX_PRIO && ((a) = (exts)->actions[i]); i++) #else #define tcf_exts_for_each_action(i, a, exts) \ for (; 0; (void)(i), (void)(a), (void)(exts)) #endif #define tcf_act_for_each_action(i, a, actions) \ for (i = 0; i < TCA_ACT_MAX_PRIO && ((a) = actions[i]); i++) static inline bool tc_act_in_hw(struct tc_action *act) { return !!act->in_hw_count; } static inline void tcf_exts_hw_stats_update(const struct tcf_exts *exts, struct flow_stats *stats, bool use_act_stats) { #ifdef CONFIG_NET_CLS_ACT int i; for (i = 0; i < exts->nr_actions; i++) { struct tc_action *a = exts->actions[i]; if (use_act_stats || tc_act_in_hw(a)) { if (!tcf_action_update_hw_stats(a)) continue; } preempt_disable(); tcf_action_stats_update(a, stats->bytes, stats->pkts, stats->drops, stats->lastused, true); preempt_enable(); a->used_hw_stats = stats->used_hw_stats; a->used_hw_stats_valid = stats->used_hw_stats_valid; } #endif } /** * tcf_exts_has_actions - check if at least one action is present * @exts: tc filter extensions handle * * Returns: true if at least one action is present. */ static inline bool tcf_exts_has_actions(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT return exts->nr_actions; #else return false; #endif } /** * tcf_exts_exec - execute tc filter extensions * @skb: socket buffer * @exts: tc filter extensions handle * @res: desired result * * Executes all configured extensions. Returns TC_ACT_OK on a normal execution, * a negative number if the filter must be considered unmatched or * a positive action code (TC_ACT_*) which must be returned to the * underlying layer. */ static inline int tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, struct tcf_result *res) { #ifdef CONFIG_NET_CLS_ACT return tcf_action_exec(skb, exts->actions, exts->nr_actions, res); #endif return TC_ACT_OK; } static inline int tcf_exts_exec_ex(struct sk_buff *skb, struct tcf_exts *exts, int act_index, struct tcf_result *res) { #ifdef CONFIG_NET_CLS_ACT return tcf_action_exec(skb, exts->actions + act_index, exts->nr_actions - act_index, res); #else return TC_ACT_OK; #endif } int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, struct netlink_ext_ack *extack); int tcf_exts_validate_ex(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack); void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); /** * struct tcf_pkt_info - packet information * * @ptr: start of the pkt data * @nexthdr: offset of the next header */ struct tcf_pkt_info { unsigned char * ptr; int nexthdr; }; #ifdef CONFIG_NET_EMATCH struct tcf_ematch_ops; /** * struct tcf_ematch - extended match (ematch) * * @matchid: identifier to allow userspace to reidentify a match * @flags: flags specifying attributes and the relation to other matches * @ops: the operations lookup table of the corresponding ematch module * @datalen: length of the ematch specific configuration data * @data: ematch specific data * @net: the network namespace */ struct tcf_ematch { struct tcf_ematch_ops * ops; unsigned long data; unsigned int datalen; u16 matchid; u16 flags; struct net *net; }; static inline int tcf_em_is_container(struct tcf_ematch *em) { return !em->ops; } static inline int tcf_em_is_simple(struct tcf_ematch *em) { return em->flags & TCF_EM_SIMPLE; } static inline int tcf_em_is_inverted(struct tcf_ematch *em) { return em->flags & TCF_EM_INVERT; } static inline int tcf_em_last_match(struct tcf_ematch *em) { return (em->flags & TCF_EM_REL_MASK) == TCF_EM_REL_END; } static inline int tcf_em_early_end(struct tcf_ematch *em, int result) { if (tcf_em_last_match(em)) return 1; if (result == 0 && em->flags & TCF_EM_REL_AND) return 1; if (result != 0 && em->flags & TCF_EM_REL_OR) return 1; return 0; } /** * struct tcf_ematch_tree - ematch tree handle * * @hdr: ematch tree header supplied by userspace * @matches: array of ematches */ struct tcf_ematch_tree { struct tcf_ematch_tree_hdr hdr; struct tcf_ematch * matches; }; /** * struct tcf_ematch_ops - ematch module operations * * @kind: identifier (kind) of this ematch module * @datalen: length of expected configuration data (optional) * @change: called during validation (optional) * @match: called during ematch tree evaluation, must return 1/0 * @destroy: called during destroyage (optional) * @dump: called during dumping process (optional) * @owner: owner, must be set to THIS_MODULE * @link: link to previous/next ematch module (internal use) */ struct tcf_ematch_ops { int kind; int datalen; int (*change)(struct net *net, void *, int, struct tcf_ematch *); int (*match)(struct sk_buff *, struct tcf_ematch *, struct tcf_pkt_info *); void (*destroy)(struct tcf_ematch *); int (*dump)(struct sk_buff *, struct tcf_ematch *); struct module *owner; struct list_head link; }; int tcf_em_register(struct tcf_ematch_ops *); void tcf_em_unregister(struct tcf_ematch_ops *); int tcf_em_tree_validate(struct tcf_proto *, struct nlattr *, struct tcf_ematch_tree *); void tcf_em_tree_destroy(struct tcf_ematch_tree *); int tcf_em_tree_dump(struct sk_buff *, struct tcf_ematch_tree *, int); int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, struct tcf_pkt_info *); /** * tcf_em_tree_match - evaluate an ematch tree * * @skb: socket buffer of the packet in question * @tree: ematch tree to be used for evaluation * @info: packet information examined by classifier * * This function matches @skb against the ematch tree in @tree by going * through all ematches respecting their logic relations returning * as soon as the result is obvious. * * Returns: 1 if the ematch tree as-one matches, no ematches are configured * or ematch is not enabled in the kernel, otherwise 0 is returned. */ static inline int tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree, struct tcf_pkt_info *info) { if (tree->hdr.nmatches) return __tcf_em_tree_match(skb, tree, info); else return 1; } #define MODULE_ALIAS_TCF_EMATCH(kind) MODULE_ALIAS("ematch-kind-" __stringify(kind)) #else /* CONFIG_NET_EMATCH */ struct tcf_ematch_tree { }; #define tcf_em_tree_validate(tp, tb, t) ((void)(t), 0) #define tcf_em_tree_destroy(t) do { (void)(t); } while(0) #define tcf_em_tree_dump(skb, t, tlv) (0) #define tcf_em_tree_match(skb, t, info) ((void)(info), 1) #endif /* CONFIG_NET_EMATCH */ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer) { switch (layer) { case TCF_LAYER_LINK: return skb_mac_header(skb); case TCF_LAYER_NETWORK: return skb_network_header(skb); case TCF_LAYER_TRANSPORT: return skb_transport_header(skb); } return NULL; } static inline int tcf_valid_offset(const struct sk_buff *skb, const unsigned char *ptr, const int len) { return likely((ptr + len) <= skb_tail_pointer(skb) && ptr >= skb->head && (ptr <= (ptr + len))); } static inline int tcf_change_indev(struct net *net, struct nlattr *indev_tlv, struct netlink_ext_ack *extack) { char indev[IFNAMSIZ]; struct net_device *dev; if (nla_strscpy(indev, indev_tlv, IFNAMSIZ) < 0) { NL_SET_ERR_MSG_ATTR(extack, indev_tlv, "Interface name too long"); return -EINVAL; } dev = __dev_get_by_name(net, indev); if (!dev) { NL_SET_ERR_MSG_ATTR(extack, indev_tlv, "Network device not found"); return -ENODEV; } return dev->ifindex; } static inline bool tcf_match_indev(struct sk_buff *skb, int ifindex) { if (!ifindex) return true; if (!skb->skb_iif) return false; return ifindex == skb->skb_iif; } int tc_setup_offload_action(struct flow_action *flow_action, const struct tcf_exts *exts, struct netlink_ext_ack *extack); void tc_cleanup_offload_action(struct flow_action *flow_action); int tc_setup_action(struct flow_action *flow_action, struct tc_action *actions[], u32 miss_cookie_base, struct netlink_ext_ack *extack); int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, void *type_data, bool err_stop, bool rtnl_held); int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held); int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *old_flags, unsigned int *old_in_hw_count, u32 *new_flags, unsigned int *new_in_hw_count, bool rtnl_held); int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held); int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, enum tc_setup_type type, void *type_data, void *cb_priv, u32 *flags, unsigned int *in_hw_count); unsigned int tcf_exts_num_actions(struct tcf_exts *exts); #ifdef CONFIG_NET_CLS_ACT int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, enum flow_block_binder_type binder_type, struct nlattr *block_index_attr, struct netlink_ext_ack *extack); void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch); int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, struct netlink_ext_ack *extack); struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, struct sk_buff **to_free, int *ret); int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe); #else static inline int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, enum flow_block_binder_type binder_type, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { return 0; } static inline void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch) { } static inline int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { return 0; } static inline struct sk_buff * tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, struct sk_buff **to_free, int *ret) { return skb; } static inline int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe) { return 0; } #endif struct tc_cls_u32_knode { struct tcf_exts *exts; struct tcf_result *res; struct tc_u32_sel *sel; u32 handle; u32 val; u32 mask; u32 link_handle; u8 fshift; }; struct tc_cls_u32_hnode { u32 handle; u32 prio; unsigned int divisor; }; enum tc_clsu32_command { TC_CLSU32_NEW_KNODE, TC_CLSU32_REPLACE_KNODE, TC_CLSU32_DELETE_KNODE, TC_CLSU32_NEW_HNODE, TC_CLSU32_REPLACE_HNODE, TC_CLSU32_DELETE_HNODE, }; struct tc_cls_u32_offload { struct flow_cls_common_offload common; /* knode values */ enum tc_clsu32_command command; union { struct tc_cls_u32_knode knode; struct tc_cls_u32_hnode hnode; }; }; static inline bool tc_can_offload(const struct net_device *dev) { return dev->features & NETIF_F_HW_TC; } static inline bool tc_can_offload_extack(const struct net_device *dev, struct netlink_ext_ack *extack) { bool can = tc_can_offload(dev); if (!can) NL_SET_ERR_MSG(extack, "TC offload is disabled on net device"); return can; } static inline bool tc_cls_can_offload_and_chain0(const struct net_device *dev, struct flow_cls_common_offload *common) { if (!tc_can_offload_extack(dev, common->extack)) return false; if (common->chain_index) { NL_SET_ERR_MSG(common->extack, "Driver supports only offload of chain 0"); return false; } return true; } static inline bool tc_skip_hw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false; } static inline bool tc_skip_sw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_SW) ? true : false; } /* SKIP_HW and SKIP_SW are mutually exclusive flags. */ static inline bool tc_flags_valid(u32 flags) { if (flags & ~(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW | TCA_CLS_FLAGS_VERBOSE)) return false; flags &= TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW; if (!(flags ^ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW))) return false; return true; } static inline bool tc_in_hw(u32 flags) { return (flags & TCA_CLS_FLAGS_IN_HW) ? true : false; } static inline void tc_cls_common_offload_init(struct flow_cls_common_offload *cls_common, const struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio >> 16; cls_common->skip_sw = tc_skip_sw(flags); if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } static inline void tcf_proto_update_usesw(struct tcf_proto *tp, u32 flags) { if (tp->usesw) return; if (tc_skip_sw(flags) && tc_in_hw(flags)) return; tp->usesw = true; } #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) static inline struct tc_skb_ext *tc_skb_ext_alloc(struct sk_buff *skb) { struct tc_skb_ext *tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT); if (tc_skb_ext) memset(tc_skb_ext, 0, sizeof(*tc_skb_ext)); return tc_skb_ext; } #endif enum tc_matchall_command { TC_CLSMATCHALL_REPLACE, TC_CLSMATCHALL_DESTROY, TC_CLSMATCHALL_STATS, }; struct tc_cls_matchall_offload { struct flow_cls_common_offload common; enum tc_matchall_command command; struct flow_rule *rule; struct flow_stats stats; bool use_act_stats; unsigned long cookie; }; enum tc_clsbpf_command { TC_CLSBPF_OFFLOAD, TC_CLSBPF_STATS, }; struct tc_cls_bpf_offload { struct flow_cls_common_offload common; enum tc_clsbpf_command command; struct tcf_exts *exts; struct bpf_prog *prog; struct bpf_prog *oldprog; const char *name; bool exts_integrated; }; /* This structure holds cookie structure that is passed from user * to the kernel for actions and classifiers */ struct tc_cookie { u8 *data; u32 len; struct rcu_head rcu; }; struct tc_qopt_offload_stats { struct gnet_stats_basic_sync *bstats; struct gnet_stats_queue *qstats; }; enum tc_mq_command { TC_MQ_CREATE, TC_MQ_DESTROY, TC_MQ_STATS, TC_MQ_GRAFT, }; struct tc_mq_opt_offload_graft_params { unsigned long queue; u32 child_handle; }; struct tc_mq_qopt_offload { enum tc_mq_command command; u32 handle; union { struct tc_qopt_offload_stats stats; struct tc_mq_opt_offload_graft_params graft_params; }; }; enum tc_htb_command { /* Root */ TC_HTB_CREATE, /* Initialize HTB offload. */ TC_HTB_DESTROY, /* Destroy HTB offload. */ /* Classes */ /* Allocate qid and create leaf. */ TC_HTB_LEAF_ALLOC_QUEUE, /* Convert leaf to inner, preserve and return qid, create new leaf. */ TC_HTB_LEAF_TO_INNER, /* Delete leaf, while siblings remain. */ TC_HTB_LEAF_DEL, /* Delete leaf, convert parent to leaf, preserving qid. */ TC_HTB_LEAF_DEL_LAST, /* TC_HTB_LEAF_DEL_LAST, but delete driver data on hardware errors. */ TC_HTB_LEAF_DEL_LAST_FORCE, /* Modify parameters of a node. */ TC_HTB_NODE_MODIFY, /* Class qdisc */ TC_HTB_LEAF_QUERY_QUEUE, /* Query qid by classid. */ }; struct tc_htb_qopt_offload { struct netlink_ext_ack *extack; enum tc_htb_command command; u32 parent_classid; u16 classid; u16 qid; u32 quantum; u64 rate; u64 ceil; u8 prio; }; #define TC_HTB_CLASSID_ROOT U32_MAX enum tc_red_command { TC_RED_REPLACE, TC_RED_DESTROY, TC_RED_STATS, TC_RED_XSTATS, TC_RED_GRAFT, }; struct tc_red_qopt_offload_params { u32 min; u32 max; u32 probability; u32 limit; bool is_ecn; bool is_harddrop; bool is_nodrop; struct gnet_stats_queue *qstats; }; struct tc_red_qopt_offload { enum tc_red_command command; u32 handle; u32 parent; union { struct tc_red_qopt_offload_params set; struct tc_qopt_offload_stats stats; struct red_stats *xstats; u32 child_handle; }; }; enum tc_gred_command { TC_GRED_REPLACE, TC_GRED_DESTROY, TC_GRED_STATS, }; struct tc_gred_vq_qopt_offload_params { bool present; u32 limit; u32 prio; u32 min; u32 max; bool is_ecn; bool is_harddrop; u32 probability; /* Only need backlog, see struct tc_prio_qopt_offload_params */ u32 *backlog; }; struct tc_gred_qopt_offload_params { bool grio_on; bool wred_on; unsigned int dp_cnt; unsigned int dp_def; struct gnet_stats_queue *qstats; struct tc_gred_vq_qopt_offload_params tab[MAX_DPs]; }; struct tc_gred_qopt_offload_stats { struct gnet_stats_basic_sync bstats[MAX_DPs]; struct gnet_stats_queue qstats[MAX_DPs]; struct red_stats *xstats[MAX_DPs]; }; struct tc_gred_qopt_offload { enum tc_gred_command command; u32 handle; u32 parent; union { struct tc_gred_qopt_offload_params set; struct tc_gred_qopt_offload_stats stats; }; }; enum tc_prio_command { TC_PRIO_REPLACE, TC_PRIO_DESTROY, TC_PRIO_STATS, TC_PRIO_GRAFT, }; struct tc_prio_qopt_offload_params { int bands; u8 priomap[TC_PRIO_MAX + 1]; /* At the point of un-offloading the Qdisc, the reported backlog and * qlen need to be reduced by the portion that is in HW. */ struct gnet_stats_queue *qstats; }; struct tc_prio_qopt_offload_graft_params { u8 band; u32 child_handle; }; struct tc_prio_qopt_offload { enum tc_prio_command command; u32 handle; u32 parent; union { struct tc_prio_qopt_offload_params replace_params; struct tc_qopt_offload_stats stats; struct tc_prio_qopt_offload_graft_params graft_params; }; }; enum tc_root_command { TC_ROOT_GRAFT, }; struct tc_root_qopt_offload { enum tc_root_command command; u32 handle; bool ingress; }; enum tc_ets_command { TC_ETS_REPLACE, TC_ETS_DESTROY, TC_ETS_STATS, TC_ETS_GRAFT, }; struct tc_ets_qopt_offload_replace_params { unsigned int bands; u8 priomap[TC_PRIO_MAX + 1]; unsigned int quanta[TCQ_ETS_MAX_BANDS]; /* 0 for strict bands. */ unsigned int weights[TCQ_ETS_MAX_BANDS]; struct gnet_stats_queue *qstats; }; struct tc_ets_qopt_offload_graft_params { u8 band; u32 child_handle; }; struct tc_ets_qopt_offload { enum tc_ets_command command; u32 handle; u32 parent; union { struct tc_ets_qopt_offload_replace_params replace_params; struct tc_qopt_offload_stats stats; struct tc_ets_qopt_offload_graft_params graft_params; }; }; enum tc_tbf_command { TC_TBF_REPLACE, TC_TBF_DESTROY, TC_TBF_STATS, TC_TBF_GRAFT, }; struct tc_tbf_qopt_offload_replace_params { struct psched_ratecfg rate; u32 max_size; struct gnet_stats_queue *qstats; }; struct tc_tbf_qopt_offload { enum tc_tbf_command command; u32 handle; u32 parent; union { struct tc_tbf_qopt_offload_replace_params replace_params; struct tc_qopt_offload_stats stats; u32 child_handle; }; }; enum tc_fifo_command { TC_FIFO_REPLACE, TC_FIFO_DESTROY, TC_FIFO_STATS, }; struct tc_fifo_qopt_offload { enum tc_fifo_command command; u32 handle; u32 parent; union { struct tc_qopt_offload_stats stats; }; }; #ifdef CONFIG_NET_CLS_ACT DECLARE_STATIC_KEY_FALSE(tc_skb_ext_tc); void tc_skb_ext_tc_enable(void); void tc_skb_ext_tc_disable(void); #define tc_skb_ext_tc_enabled() static_branch_unlikely(&tc_skb_ext_tc) #else /* CONFIG_NET_CLS_ACT */ static inline void tc_skb_ext_tc_enable(void) { } static inline void tc_skb_ext_tc_disable(void) { } #define tc_skb_ext_tc_enabled() false #endif #endif |
| 6 6 6 6 6 6 6 6 7 6 3 1 6 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 | // SPDX-License-Identifier: GPL-2.0-only /* * LED Class Core * * Copyright (C) 2005 John Lenz <lenz@cs.wisc.edu> * Copyright (C) 2005-2007 Richard Purdie <rpurdie@openedhand.com> */ #include <linux/ctype.h> #include <linux/device.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/leds.h> #include <linux/list.h> #include <linux/module.h> #include <linux/property.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/timer.h> #include <uapi/linux/uleds.h> #include <linux/of.h> #include "leds.h" static DEFINE_MUTEX(leds_lookup_lock); static LIST_HEAD(leds_lookup_list); static struct workqueue_struct *leds_wq; static ssize_t brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); unsigned int brightness; mutex_lock(&led_cdev->led_access); led_update_brightness(led_cdev); brightness = led_cdev->brightness; mutex_unlock(&led_cdev->led_access); return sprintf(buf, "%u\n", brightness); } static ssize_t brightness_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { struct led_classdev *led_cdev = dev_get_drvdata(dev); unsigned long state; ssize_t ret; mutex_lock(&led_cdev->led_access); if (led_sysfs_is_disabled(led_cdev)) { ret = -EBUSY; goto unlock; } ret = kstrtoul(buf, 10, &state); if (ret) goto unlock; if (state == LED_OFF) led_trigger_remove(led_cdev); led_set_brightness(led_cdev, state); ret = size; unlock: mutex_unlock(&led_cdev->led_access); return ret; } static DEVICE_ATTR_RW(brightness); static ssize_t max_brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); unsigned int max_brightness; mutex_lock(&led_cdev->led_access); max_brightness = led_cdev->max_brightness; mutex_unlock(&led_cdev->led_access); return sprintf(buf, "%u\n", max_brightness); } static DEVICE_ATTR_RO(max_brightness); #ifdef CONFIG_LEDS_TRIGGERS static const BIN_ATTR(trigger, 0644, led_trigger_read, led_trigger_write, 0); static const struct bin_attribute *const led_trigger_bin_attrs[] = { &bin_attr_trigger, NULL, }; static const struct attribute_group led_trigger_group = { .bin_attrs_new = led_trigger_bin_attrs, }; #endif static struct attribute *led_class_attrs[] = { &dev_attr_brightness.attr, &dev_attr_max_brightness.attr, NULL, }; static const struct attribute_group led_group = { .attrs = led_class_attrs, }; static const struct attribute_group *led_groups[] = { &led_group, #ifdef CONFIG_LEDS_TRIGGERS &led_trigger_group, #endif NULL, }; #ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED static ssize_t brightness_hw_changed_show(struct device *dev, struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); if (led_cdev->brightness_hw_changed == -1) return -ENODATA; return sprintf(buf, "%u\n", led_cdev->brightness_hw_changed); } static DEVICE_ATTR_RO(brightness_hw_changed); static int led_add_brightness_hw_changed(struct led_classdev *led_cdev) { struct device *dev = led_cdev->dev; int ret; ret = device_create_file(dev, &dev_attr_brightness_hw_changed); if (ret) { dev_err(dev, "Error creating brightness_hw_changed\n"); return ret; } led_cdev->brightness_hw_changed_kn = sysfs_get_dirent(dev->kobj.sd, "brightness_hw_changed"); if (!led_cdev->brightness_hw_changed_kn) { dev_err(dev, "Error getting brightness_hw_changed kn\n"); device_remove_file(dev, &dev_attr_brightness_hw_changed); return -ENXIO; } return 0; } static void led_remove_brightness_hw_changed(struct led_classdev *led_cdev) { sysfs_put(led_cdev->brightness_hw_changed_kn); device_remove_file(led_cdev->dev, &dev_attr_brightness_hw_changed); } void led_classdev_notify_brightness_hw_changed(struct led_classdev *led_cdev, unsigned int brightness) { if (WARN_ON(!led_cdev->brightness_hw_changed_kn)) return; led_cdev->brightness_hw_changed = brightness; sysfs_notify_dirent(led_cdev->brightness_hw_changed_kn); } EXPORT_SYMBOL_GPL(led_classdev_notify_brightness_hw_changed); #else static int led_add_brightness_hw_changed(struct led_classdev *led_cdev) { return 0; } static void led_remove_brightness_hw_changed(struct led_classdev *led_cdev) { } #endif /** * led_classdev_suspend - suspend an led_classdev. * @led_cdev: the led_classdev to suspend. */ void led_classdev_suspend(struct led_classdev *led_cdev) { led_cdev->flags |= LED_SUSPENDED; led_set_brightness_nopm(led_cdev, 0); flush_work(&led_cdev->set_brightness_work); } EXPORT_SYMBOL_GPL(led_classdev_suspend); /** * led_classdev_resume - resume an led_classdev. * @led_cdev: the led_classdev to resume. */ void led_classdev_resume(struct led_classdev *led_cdev) { led_set_brightness_nopm(led_cdev, led_cdev->brightness); if (led_cdev->flash_resume) led_cdev->flash_resume(led_cdev); led_cdev->flags &= ~LED_SUSPENDED; } EXPORT_SYMBOL_GPL(led_classdev_resume); #ifdef CONFIG_PM_SLEEP static int led_suspend(struct device *dev) { struct led_classdev *led_cdev = dev_get_drvdata(dev); if (led_cdev->flags & LED_CORE_SUSPENDRESUME) led_classdev_suspend(led_cdev); return 0; } static int led_resume(struct device *dev) { struct led_classdev *led_cdev = dev_get_drvdata(dev); if (led_cdev->flags & LED_CORE_SUSPENDRESUME) led_classdev_resume(led_cdev); return 0; } #endif static SIMPLE_DEV_PM_OPS(leds_class_dev_pm_ops, led_suspend, led_resume); static struct led_classdev *led_module_get(struct device *led_dev) { struct led_classdev *led_cdev; if (!led_dev) return ERR_PTR(-EPROBE_DEFER); led_cdev = dev_get_drvdata(led_dev); if (!try_module_get(led_cdev->dev->parent->driver->owner)) { put_device(led_cdev->dev); return ERR_PTR(-ENODEV); } return led_cdev; } static const struct class leds_class = { .name = "leds", .dev_groups = led_groups, .pm = &leds_class_dev_pm_ops, }; /** * of_led_get() - request a LED device via the LED framework * @np: device node to get the LED device from * @index: the index of the LED * * Returns the LED device parsed from the phandle specified in the "leds" * property of a device tree node or a negative error-code on failure. */ struct led_classdev *of_led_get(struct device_node *np, int index) { struct device *led_dev; struct device_node *led_node; led_node = of_parse_phandle(np, "leds", index); if (!led_node) return ERR_PTR(-ENOENT); led_dev = class_find_device_by_of_node(&leds_class, led_node); of_node_put(led_node); return led_module_get(led_dev); } EXPORT_SYMBOL_GPL(of_led_get); /** * led_put() - release a LED device * @led_cdev: LED device */ void led_put(struct led_classdev *led_cdev) { module_put(led_cdev->dev->parent->driver->owner); put_device(led_cdev->dev); } EXPORT_SYMBOL_GPL(led_put); static void devm_led_release(struct device *dev, void *res) { struct led_classdev **p = res; led_put(*p); } static struct led_classdev *__devm_led_get(struct device *dev, struct led_classdev *led) { struct led_classdev **dr; dr = devres_alloc(devm_led_release, sizeof(struct led_classdev *), GFP_KERNEL); if (!dr) { led_put(led); return ERR_PTR(-ENOMEM); } *dr = led; devres_add(dev, dr); return led; } /** * devm_of_led_get - Resource-managed request of a LED device * @dev: LED consumer * @index: index of the LED to obtain in the consumer * * The device node of the device is parse to find the request LED device. * The LED device returned from this function is automatically released * on driver detach. * * @return a pointer to a LED device or ERR_PTR(errno) on failure. */ struct led_classdev *__must_check devm_of_led_get(struct device *dev, int index) { struct led_classdev *led; if (!dev) return ERR_PTR(-EINVAL); led = of_led_get(dev->of_node, index); if (IS_ERR(led)) return led; return __devm_led_get(dev, led); } EXPORT_SYMBOL_GPL(devm_of_led_get); /** * led_get() - request a LED device via the LED framework * @dev: device for which to get the LED device * @con_id: name of the LED from the device's point of view * * @return a pointer to a LED device or ERR_PTR(errno) on failure. */ struct led_classdev *led_get(struct device *dev, char *con_id) { struct led_lookup_data *lookup; const char *provider = NULL; struct device *led_dev; mutex_lock(&leds_lookup_lock); list_for_each_entry(lookup, &leds_lookup_list, list) { if (!strcmp(lookup->dev_id, dev_name(dev)) && !strcmp(lookup->con_id, con_id)) { provider = kstrdup_const(lookup->provider, GFP_KERNEL); break; } } mutex_unlock(&leds_lookup_lock); if (!provider) return ERR_PTR(-ENOENT); led_dev = class_find_device_by_name(&leds_class, provider); kfree_const(provider); return led_module_get(led_dev); } EXPORT_SYMBOL_GPL(led_get); /** * devm_led_get() - request a LED device via the LED framework * @dev: device for which to get the LED device * @con_id: name of the LED from the device's point of view * * The LED device returned from this function is automatically released * on driver detach. * * @return a pointer to a LED device or ERR_PTR(errno) on failure. */ struct led_classdev *devm_led_get(struct device *dev, char *con_id) { struct led_classdev *led; led = led_get(dev, con_id); if (IS_ERR(led)) return led; return __devm_led_get(dev, led); } EXPORT_SYMBOL_GPL(devm_led_get); /** * led_add_lookup() - Add a LED lookup table entry * @led_lookup: the lookup table entry to add * * Add a LED lookup table entry. On systems without devicetree the lookup table * is used by led_get() to find LEDs. */ void led_add_lookup(struct led_lookup_data *led_lookup) { mutex_lock(&leds_lookup_lock); list_add_tail(&led_lookup->list, &leds_lookup_list); mutex_unlock(&leds_lookup_lock); } EXPORT_SYMBOL_GPL(led_add_lookup); /** * led_remove_lookup() - Remove a LED lookup table entry * @led_lookup: the lookup table entry to remove */ void led_remove_lookup(struct led_lookup_data *led_lookup) { mutex_lock(&leds_lookup_lock); list_del(&led_lookup->list); mutex_unlock(&leds_lookup_lock); } EXPORT_SYMBOL_GPL(led_remove_lookup); /** * devm_of_led_get_optional - Resource-managed request of an optional LED device * @dev: LED consumer * @index: index of the LED to obtain in the consumer * * The device node of the device is parsed to find the requested LED device. * The LED device returned from this function is automatically released * on driver detach. * * @return a pointer to a LED device, ERR_PTR(errno) on failure and NULL if the * led was not found. */ struct led_classdev *__must_check devm_of_led_get_optional(struct device *dev, int index) { struct led_classdev *led; led = devm_of_led_get(dev, index); if (IS_ERR(led) && PTR_ERR(led) == -ENOENT) return NULL; return led; } EXPORT_SYMBOL_GPL(devm_of_led_get_optional); static int led_classdev_next_name(const char *init_name, char *name, size_t len) { unsigned int i = 0; int ret = 0; struct device *dev; strscpy(name, init_name, len); while ((ret < len) && (dev = class_find_device_by_name(&leds_class, name))) { put_device(dev); ret = snprintf(name, len, "%s_%u", init_name, ++i); } if (ret >= len) return -ENOMEM; return i; } /** * led_classdev_register_ext - register a new object of led_classdev class * with init data. * * @parent: parent of LED device * @led_cdev: the led_classdev structure for this device. * @init_data: LED class device initialization data */ int led_classdev_register_ext(struct device *parent, struct led_classdev *led_cdev, struct led_init_data *init_data) { char composed_name[LED_MAX_NAME_SIZE]; char final_name[LED_MAX_NAME_SIZE]; const char *proposed_name = composed_name; int ret; if (init_data) { if (init_data->devname_mandatory && !init_data->devicename) { dev_err(parent, "Mandatory device name is missing"); return -EINVAL; } ret = led_compose_name(parent, init_data, composed_name); if (ret < 0) return ret; if (init_data->fwnode) { fwnode_property_read_string(init_data->fwnode, "linux,default-trigger", &led_cdev->default_trigger); if (fwnode_property_present(init_data->fwnode, "retain-state-shutdown")) led_cdev->flags |= LED_RETAIN_AT_SHUTDOWN; fwnode_property_read_u32(init_data->fwnode, "max-brightness", &led_cdev->max_brightness); if (fwnode_property_present(init_data->fwnode, "color")) fwnode_property_read_u32(init_data->fwnode, "color", &led_cdev->color); } } else { proposed_name = led_cdev->name; } ret = led_classdev_next_name(proposed_name, final_name, sizeof(final_name)); if (ret < 0) return ret; else if (ret && led_cdev->flags & LED_REJECT_NAME_CONFLICT) return -EEXIST; else if (ret) dev_warn(parent, "Led %s renamed to %s due to name collision\n", proposed_name, final_name); if (led_cdev->color >= LED_COLOR_ID_MAX) dev_warn(parent, "LED %s color identifier out of range\n", final_name); mutex_init(&led_cdev->led_access); mutex_lock(&led_cdev->led_access); led_cdev->dev = device_create_with_groups(&leds_class, parent, 0, led_cdev, led_cdev->groups, "%s", final_name); if (IS_ERR(led_cdev->dev)) { mutex_unlock(&led_cdev->led_access); return PTR_ERR(led_cdev->dev); } if (init_data && init_data->fwnode) device_set_node(led_cdev->dev, init_data->fwnode); if (led_cdev->flags & LED_BRIGHT_HW_CHANGED) { ret = led_add_brightness_hw_changed(led_cdev); if (ret) { device_unregister(led_cdev->dev); led_cdev->dev = NULL; mutex_unlock(&led_cdev->led_access); return ret; } } led_cdev->work_flags = 0; #ifdef CONFIG_LEDS_TRIGGERS init_rwsem(&led_cdev->trigger_lock); #endif #ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED led_cdev->brightness_hw_changed = -1; #endif /* add to the list of leds */ down_write(&leds_list_lock); list_add_tail(&led_cdev->node, &leds_list); up_write(&leds_list_lock); if (!led_cdev->max_brightness) led_cdev->max_brightness = LED_FULL; led_update_brightness(led_cdev); led_cdev->wq = leds_wq; led_init_core(led_cdev); #ifdef CONFIG_LEDS_TRIGGERS led_trigger_set_default(led_cdev); #endif mutex_unlock(&led_cdev->led_access); dev_dbg(parent, "Registered led device: %s\n", led_cdev->name); return 0; } EXPORT_SYMBOL_GPL(led_classdev_register_ext); /** * led_classdev_unregister - unregisters a object of led_properties class. * @led_cdev: the led device to unregister * * Unregisters a previously registered via led_classdev_register object. */ void led_classdev_unregister(struct led_classdev *led_cdev) { if (IS_ERR_OR_NULL(led_cdev->dev)) return; #ifdef CONFIG_LEDS_TRIGGERS down_write(&led_cdev->trigger_lock); if (led_cdev->trigger) led_trigger_set(led_cdev, NULL); up_write(&led_cdev->trigger_lock); #endif led_cdev->flags |= LED_UNREGISTERING; /* Stop blinking */ led_stop_software_blink(led_cdev); if (!(led_cdev->flags & LED_RETAIN_AT_SHUTDOWN)) led_set_brightness(led_cdev, LED_OFF); flush_work(&led_cdev->set_brightness_work); if (led_cdev->flags & LED_BRIGHT_HW_CHANGED) led_remove_brightness_hw_changed(led_cdev); device_unregister(led_cdev->dev); down_write(&leds_list_lock); list_del(&led_cdev->node); up_write(&leds_list_lock); mutex_destroy(&led_cdev->led_access); } EXPORT_SYMBOL_GPL(led_classdev_unregister); static void devm_led_classdev_release(struct device *dev, void *res) { led_classdev_unregister(*(struct led_classdev **)res); } /** * devm_led_classdev_register_ext - resource managed led_classdev_register_ext() * * @parent: parent of LED device * @led_cdev: the led_classdev structure for this device. * @init_data: LED class device initialization data */ int devm_led_classdev_register_ext(struct device *parent, struct led_classdev *led_cdev, struct led_init_data *init_data) { struct led_classdev **dr; int rc; dr = devres_alloc(devm_led_classdev_release, sizeof(*dr), GFP_KERNEL); if (!dr) return -ENOMEM; rc = led_classdev_register_ext(parent, led_cdev, init_data); if (rc) { devres_free(dr); return rc; } *dr = led_cdev; devres_add(parent, dr); return 0; } EXPORT_SYMBOL_GPL(devm_led_classdev_register_ext); static int devm_led_classdev_match(struct device *dev, void *res, void *data) { struct led_classdev **p = res; if (WARN_ON(!p || !*p)) return 0; return *p == data; } /** * devm_led_classdev_unregister() - resource managed led_classdev_unregister() * @dev: The device to unregister. * @led_cdev: the led_classdev structure for this device. */ void devm_led_classdev_unregister(struct device *dev, struct led_classdev *led_cdev) { WARN_ON(devres_release(dev, devm_led_classdev_release, devm_led_classdev_match, led_cdev)); } EXPORT_SYMBOL_GPL(devm_led_classdev_unregister); static int __init leds_init(void) { leds_wq = alloc_ordered_workqueue("leds", 0); if (!leds_wq) { pr_err("Failed to create LEDs ordered workqueue\n"); return -ENOMEM; } return class_register(&leds_class); } static void __exit leds_exit(void) { class_unregister(&leds_class); destroy_workqueue(leds_wq); } subsys_initcall(leds_init); module_exit(leds_exit); MODULE_AUTHOR("John Lenz, Richard Purdie"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("LED Class Interface"); |
| 41 41 41 41 41 41 41 390 433 432 40 391 392 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 391 430 433 430 432 41 392 430 391 41 41 35 432 433 433 432 432 432 430 432 433 433 430 431 433 433 433 431 430 432 433 41 390 432 433 390 390 392 392 391 41 41 41 41 41 41 41 41 41 41 41 41 40 41 41 41 41 40 41 41 41 41 41 41 41 41 41 41 40 41 41 41 37 26 34 37 37 37 37 37 27 10 24 13 27 10 41 41 41 1 28 10 1 35 36 36 41 41 41 41 41 41 36 41 6 35 41 41 41 41 41 41 41 41 41 41 41 1 35 35 41 41 40 41 41 41 41 41 41 41 41 41 41 41 35 40 41 40 41 35 35 41 35 35 41 41 41 41 41 41 35 41 41 41 41 41 41 41 41 40 41 41 41 41 41 41 35 41 433 41 433 433 433 432 433 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 | // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses. */ /*-************************************* * Dependencies ***************************************/ #include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ #include "../common/mem.h" #include "../common/error_private.h" #include "hist.h" /* HIST_countFast_wksp */ #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ #include "../common/fse.h" #include "../common/huf.h" #include "zstd_compress_internal.h" #include "zstd_compress_sequences.h" #include "zstd_compress_literals.h" #include "zstd_fast.h" #include "zstd_double_fast.h" #include "zstd_lazy.h" #include "zstd_opt.h" #include "zstd_ldm.h" #include "zstd_compress_superblock.h" #include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ /* *************************************************************** * Tuning parameters *****************************************************************/ /*! * COMPRESS_HEAPMODE : * Select how default decompression function ZSTD_compress() allocates its context, * on stack (0, default), or into heap (1). * Note that functions with explicit context such as ZSTD_compressCCtx() are unaffected. */ /*! * ZSTD_HASHLOG3_MAX : * Maximum size of the hash table dedicated to find 3-bytes matches, * in log format, aka 17 => 1 << 17 == 128Ki positions. * This structure is only used in zstd_opt. * Since allocation is centralized for all strategies, it has to be known here. * The actual (selected) size of the hash table is then stored in ZSTD_MatchState_t.hashLog3, * so that zstd_opt.c doesn't need to know about this constant. */ #ifndef ZSTD_HASHLOG3_MAX # define ZSTD_HASHLOG3_MAX 17 #endif /*-************************************* * Helper functions ***************************************/ /* ZSTD_compressBound() * Note that the result from this function is only valid for * the one-pass compression functions. * When employing the streaming mode, * if flushes are frequently altering the size of blocks, * the overhead from block headers can make the compressed data larger * than the return value of ZSTD_compressBound(). */ size_t ZSTD_compressBound(size_t srcSize) { size_t const r = ZSTD_COMPRESSBOUND(srcSize); if (r==0) return ERROR(srcSize_wrong); return r; } /*-************************************* * Context memory management ***************************************/ struct ZSTD_CDict_s { const void* dictContent; size_t dictContentSize; ZSTD_dictContentType_e dictContentType; /* The dictContentType the CDict was created with */ U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */ ZSTD_cwksp workspace; ZSTD_MatchState_t matchState; ZSTD_compressedBlockState_t cBlockState; ZSTD_customMem customMem; U32 dictID; int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */ ZSTD_ParamSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use * row-based matchfinder. Unless the cdict is reloaded, we will use * the same greedy/lazy matchfinder at compression time. */ }; /* typedef'd to ZSTD_CDict within "zstd.h" */ ZSTD_CCtx* ZSTD_createCCtx(void) { return ZSTD_createCCtx_advanced(ZSTD_defaultCMem); } static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager) { assert(cctx != NULL); ZSTD_memset(cctx, 0, sizeof(*cctx)); cctx->customMem = memManager; cctx->bmi2 = ZSTD_cpuSupportsBmi2(); { size_t const err = ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters); assert(!ZSTD_isError(err)); (void)err; } } ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem) { ZSTD_STATIC_ASSERT(zcss_init==0); ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN==(0ULL - 1)); if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; { ZSTD_CCtx* const cctx = (ZSTD_CCtx*)ZSTD_customMalloc(sizeof(ZSTD_CCtx), customMem); if (!cctx) return NULL; ZSTD_initCCtx(cctx, customMem); return cctx; } } ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize) { ZSTD_cwksp ws; ZSTD_CCtx* cctx; if (workspaceSize <= sizeof(ZSTD_CCtx)) return NULL; /* minimum size */ if ((size_t)workspace & 7) return NULL; /* must be 8-aligned */ ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_static_alloc); cctx = (ZSTD_CCtx*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CCtx)); if (cctx == NULL) return NULL; ZSTD_memset(cctx, 0, sizeof(ZSTD_CCtx)); ZSTD_cwksp_move(&cctx->workspace, &ws); cctx->staticSize = workspaceSize; /* statically sized space. tmpWorkspace never moves (but prev/next block swap places) */ if (!ZSTD_cwksp_check_available(&cctx->workspace, TMP_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); cctx->tmpWorkspace = ZSTD_cwksp_reserve_object(&cctx->workspace, TMP_WORKSPACE_SIZE); cctx->tmpWkspSize = TMP_WORKSPACE_SIZE; cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); return cctx; } /* * Clears and frees all of the dictionaries in the CCtx. */ static void ZSTD_clearAllDicts(ZSTD_CCtx* cctx) { ZSTD_customFree(cctx->localDict.dictBuffer, cctx->customMem); ZSTD_freeCDict(cctx->localDict.cdict); ZSTD_memset(&cctx->localDict, 0, sizeof(cctx->localDict)); ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); cctx->cdict = NULL; } static size_t ZSTD_sizeof_localDict(ZSTD_localDict dict) { size_t const bufferSize = dict.dictBuffer != NULL ? dict.dictSize : 0; size_t const cdictSize = ZSTD_sizeof_CDict(dict.cdict); return bufferSize + cdictSize; } static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) { assert(cctx != NULL); assert(cctx->staticSize == 0); ZSTD_clearAllDicts(cctx); ZSTD_cwksp_free(&cctx->workspace, cctx->customMem); } size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) { DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx); if (cctx==NULL) return 0; /* support free on NULL */ RETURN_ERROR_IF(cctx->staticSize, memory_allocation, "not compatible with static CCtx"); { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); ZSTD_freeCCtxContent(cctx); if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); } return 0; } static size_t ZSTD_sizeof_mtctx(const ZSTD_CCtx* cctx) { (void)cctx; return 0; } size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx) { if (cctx==NULL) return 0; /* support sizeof on NULL */ /* cctx may be in the workspace */ return (cctx->workspace.workspace == cctx ? 0 : sizeof(*cctx)) + ZSTD_cwksp_sizeof(&cctx->workspace) + ZSTD_sizeof_localDict(cctx->localDict) + ZSTD_sizeof_mtctx(cctx); } size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs) { return ZSTD_sizeof_CCtx(zcs); /* same object */ } /* private API call, for dictBuilder only */ const SeqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } /* Returns true if the strategy supports using a row based matchfinder */ static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { return (strategy >= ZSTD_greedy && strategy <= ZSTD_lazy2); } /* Returns true if the strategy and useRowMatchFinder mode indicate that we will use the row based matchfinder * for this compression. */ static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_ParamSwitch_e mode) { assert(mode != ZSTD_ps_auto); return ZSTD_rowMatchFinderSupported(strategy) && (mode == ZSTD_ps_enable); } /* Returns row matchfinder usage given an initial mode and cParams */ static ZSTD_ParamSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_ParamSwitch_e mode, const ZSTD_compressionParameters* const cParams) { /* The Linux Kernel does not use SIMD, and 128KB is a very common size, e.g. in BtrFS. * The row match finder is slower for this size without SIMD, so disable it. */ const unsigned kWindowLogLowerBound = 17; if (mode != ZSTD_ps_auto) return mode; /* if requested enabled, but no SIMD, we still will use row matchfinder */ mode = ZSTD_ps_disable; if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return mode; if (cParams->windowLog > kWindowLogLowerBound) mode = ZSTD_ps_enable; return mode; } /* Returns block splitter usage (generally speaking, when using slower/stronger compression modes) */ static ZSTD_ParamSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_ParamSwitch_e mode, const ZSTD_compressionParameters* const cParams) { if (mode != ZSTD_ps_auto) return mode; return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 17) ? ZSTD_ps_enable : ZSTD_ps_disable; } /* Returns 1 if the arguments indicate that we should allocate a chainTable, 0 otherwise */ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, const ZSTD_ParamSwitch_e useRowMatchFinder, const U32 forDDSDict) { assert(useRowMatchFinder != ZSTD_ps_auto); /* We always should allocate a chaintable if we are allocating a matchstate for a DDS dictionary matchstate. * We do not allocate a chaintable if we are using ZSTD_fast, or are using the row-based matchfinder. */ return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); } /* Returns ZSTD_ps_enable if compression parameters are such that we should * enable long distance matching (wlog >= 27, strategy >= btopt). * Returns ZSTD_ps_disable otherwise. */ static ZSTD_ParamSwitch_e ZSTD_resolveEnableLdm(ZSTD_ParamSwitch_e mode, const ZSTD_compressionParameters* const cParams) { if (mode != ZSTD_ps_auto) return mode; return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; } static int ZSTD_resolveExternalSequenceValidation(int mode) { return mode; } /* Resolves maxBlockSize to the default if no value is present. */ static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { if (maxBlockSize == 0) { return ZSTD_BLOCKSIZE_MAX; } else { return maxBlockSize; } } static ZSTD_ParamSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_ParamSwitch_e value, int cLevel) { if (value != ZSTD_ps_auto) return value; if (cLevel < 10) { return ZSTD_ps_disable; } else { return ZSTD_ps_enable; } } /* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */ static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; } static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( ZSTD_compressionParameters cParams) { ZSTD_CCtx_params cctxParams; /* should not matter, as all cParams are presumed properly defined */ ZSTD_CCtxParams_init(&cctxParams, ZSTD_CLEVEL_DEFAULT); cctxParams.cParams = cParams; /* Adjust advanced params according to cParams */ cctxParams.ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams.ldmParams.enableLdm, &cParams); if (cctxParams.ldmParams.enableLdm == ZSTD_ps_enable) { ZSTD_ldm_adjustParameters(&cctxParams.ldmParams, &cParams); assert(cctxParams.ldmParams.hashLog >= cctxParams.ldmParams.bucketSizeLog); assert(cctxParams.ldmParams.hashRateLog < 32); } cctxParams.postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.postBlockSplitter, &cParams); cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, cctxParams.compressionLevel); assert(!ZSTD_checkCParams(cParams)); return cctxParams; } static ZSTD_CCtx_params* ZSTD_createCCtxParams_advanced( ZSTD_customMem customMem) { ZSTD_CCtx_params* params; if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; params = (ZSTD_CCtx_params*)ZSTD_customCalloc( sizeof(ZSTD_CCtx_params), customMem); if (!params) { return NULL; } ZSTD_CCtxParams_init(params, ZSTD_CLEVEL_DEFAULT); params->customMem = customMem; return params; } ZSTD_CCtx_params* ZSTD_createCCtxParams(void) { return ZSTD_createCCtxParams_advanced(ZSTD_defaultCMem); } size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params) { if (params == NULL) { return 0; } ZSTD_customFree(params, params->customMem); return 0; } size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params) { return ZSTD_CCtxParams_init(params, ZSTD_CLEVEL_DEFAULT); } size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) { RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!"); ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); cctxParams->compressionLevel = compressionLevel; cctxParams->fParams.contentSizeFlag = 1; return 0; } #define ZSTD_NO_CLEVEL 0 /* * Initializes `cctxParams` from `params` and `compressionLevel`. * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. */ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params, int compressionLevel) { assert(!ZSTD_checkCParams(params->cParams)); ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); cctxParams->cParams = params->cParams; cctxParams->fParams = params->fParams; /* Should not matter, as all cParams are presumed properly defined. * But, set it for tracing anyway. */ cctxParams->compressionLevel = compressionLevel; cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); cctxParams->postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->postBlockSplitter, ¶ms->cParams); cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", cctxParams->useRowMatchFinder, cctxParams->postBlockSplitter, cctxParams->ldmParams.enableLdm); } size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params) { RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!"); FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , ""); ZSTD_CCtxParams_init_internal(cctxParams, ¶ms, ZSTD_NO_CLEVEL); return 0; } /* * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. * @param params Validated zstd parameters. */ static void ZSTD_CCtxParams_setZstdParams( ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) { assert(!ZSTD_checkCParams(params->cParams)); cctxParams->cParams = params->cParams; cctxParams->fParams = params->fParams; /* Should not matter, as all cParams are presumed properly defined. * But, set it for tracing anyway. */ cctxParams->compressionLevel = ZSTD_NO_CLEVEL; } ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) { ZSTD_bounds bounds = { 0, 0, 0 }; switch(param) { case ZSTD_c_compressionLevel: bounds.lowerBound = ZSTD_minCLevel(); bounds.upperBound = ZSTD_maxCLevel(); return bounds; case ZSTD_c_windowLog: bounds.lowerBound = ZSTD_WINDOWLOG_MIN; bounds.upperBound = ZSTD_WINDOWLOG_MAX; return bounds; case ZSTD_c_hashLog: bounds.lowerBound = ZSTD_HASHLOG_MIN; bounds.upperBound = ZSTD_HASHLOG_MAX; return bounds; case ZSTD_c_chainLog: bounds.lowerBound = ZSTD_CHAINLOG_MIN; bounds.upperBound = ZSTD_CHAINLOG_MAX; return bounds; case ZSTD_c_searchLog: bounds.lowerBound = ZSTD_SEARCHLOG_MIN; bounds.upperBound = ZSTD_SEARCHLOG_MAX; return bounds; case ZSTD_c_minMatch: bounds.lowerBound = ZSTD_MINMATCH_MIN; bounds.upperBound = ZSTD_MINMATCH_MAX; return bounds; case ZSTD_c_targetLength: bounds.lowerBound = ZSTD_TARGETLENGTH_MIN; bounds.upperBound = ZSTD_TARGETLENGTH_MAX; return bounds; case ZSTD_c_strategy: bounds.lowerBound = ZSTD_STRATEGY_MIN; bounds.upperBound = ZSTD_STRATEGY_MAX; return bounds; case ZSTD_c_contentSizeFlag: bounds.lowerBound = 0; bounds.upperBound = 1; return bounds; case ZSTD_c_checksumFlag: bounds.lowerBound = 0; bounds.upperBound = 1; return bounds; case ZSTD_c_dictIDFlag: bounds.lowerBound = 0; bounds.upperBound = 1; return bounds; case ZSTD_c_nbWorkers: bounds.lowerBound = 0; bounds.upperBound = 0; return bounds; case ZSTD_c_jobSize: bounds.lowerBound = 0; bounds.upperBound = 0; return bounds; case ZSTD_c_overlapLog: bounds.lowerBound = 0; bounds.upperBound = 0; return bounds; case ZSTD_c_enableDedicatedDictSearch: bounds.lowerBound = 0; bounds.upperBound = 1; return bounds; case ZSTD_c_enableLongDistanceMatching: bounds.lowerBound = (int)ZSTD_ps_auto; bounds.upperBound = (int)ZSTD_ps_disable; return bounds; case ZSTD_c_ldmHashLog: bounds.lowerBound = ZSTD_LDM_HASHLOG_MIN; bounds.upperBound = ZSTD_LDM_HASHLOG_MAX; return bounds; case ZSTD_c_ldmMinMatch: bounds.lowerBound = ZSTD_LDM_MINMATCH_MIN; bounds.upperBound = ZSTD_LDM_MINMATCH_MAX; return bounds; case ZSTD_c_ldmBucketSizeLog: bounds.lowerBound = ZSTD_LDM_BUCKETSIZELOG_MIN; bounds.upperBound = ZSTD_LDM_BUCKETSIZELOG_MAX; return bounds; case ZSTD_c_ldmHashRateLog: bounds.lowerBound = ZSTD_LDM_HASHRATELOG_MIN; bounds.upperBound = ZSTD_LDM_HASHRATELOG_MAX; return bounds; /* experimental parameters */ case ZSTD_c_rsyncable: bounds.lowerBound = 0; bounds.upperBound = 1; return bounds; case ZSTD_c_forceMaxWindow : bounds.lowerBound = 0; bounds.upperBound = 1; return bounds; case ZSTD_c_format: ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < ZSTD_f_zstd1_magicless); bounds.lowerBound = ZSTD_f_zstd1; bounds.upperBound = ZSTD_f_zstd1_magicless; /* note : how to ensure at compile time that this is the highest value enum ? */ return bounds; case ZSTD_c_forceAttachDict: ZSTD_STATIC_ASSERT(ZSTD_dictDefaultAttach < ZSTD_dictForceLoad); bounds.lowerBound = ZSTD_dictDefaultAttach; bounds.upperBound = ZSTD_dictForceLoad; /* note : how to ensure at compile time that this is the highest value enum ? */ return bounds; case ZSTD_c_literalCompressionMode: ZSTD_STATIC_ASSERT(ZSTD_ps_auto < ZSTD_ps_enable && ZSTD_ps_enable < ZSTD_ps_disable); bounds.lowerBound = (int)ZSTD_ps_auto; bounds.upperBound = (int)ZSTD_ps_disable; return bounds; case ZSTD_c_targetCBlockSize: bounds.lowerBound = ZSTD_TARGETCBLOCKSIZE_MIN; bounds.upperBound = ZSTD_TARGETCBLOCKSIZE_MAX; return bounds; case ZSTD_c_srcSizeHint: bounds.lowerBound = ZSTD_SRCSIZEHINT_MIN; bounds.upperBound = ZSTD_SRCSIZEHINT_MAX; return bounds; case ZSTD_c_stableInBuffer: case ZSTD_c_stableOutBuffer: bounds.lowerBound = (int)ZSTD_bm_buffered; bounds.upperBound = (int)ZSTD_bm_stable; return bounds; case ZSTD_c_blockDelimiters: bounds.lowerBound = (int)ZSTD_sf_noBlockDelimiters; bounds.upperBound = (int)ZSTD_sf_explicitBlockDelimiters; return bounds; case ZSTD_c_validateSequences: bounds.lowerBound = 0; bounds.upperBound = 1; return bounds; case ZSTD_c_splitAfterSequences: bounds.lowerBound = (int)ZSTD_ps_auto; bounds.upperBound = (int)ZSTD_ps_disable; return bounds; case ZSTD_c_blockSplitterLevel: bounds.lowerBound = 0; bounds.upperBound = ZSTD_BLOCKSPLITTER_LEVEL_MAX; return bounds; case ZSTD_c_useRowMatchFinder: bounds.lowerBound = (int)ZSTD_ps_auto; bounds.upperBound = (int)ZSTD_ps_disable; return bounds; case ZSTD_c_deterministicRefPrefix: bounds.lowerBound = 0; bounds.upperBound = 1; return bounds; case ZSTD_c_prefetchCDictTables: bounds.lowerBound = (int)ZSTD_ps_auto; bounds.upperBound = (int)ZSTD_ps_disable; return bounds; case ZSTD_c_enableSeqProducerFallback: bounds.lowerBound = 0; bounds.upperBound = 1; return bounds; case ZSTD_c_maxBlockSize: bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; bounds.upperBound = ZSTD_BLOCKSIZE_MAX; return bounds; case ZSTD_c_repcodeResolution: bounds.lowerBound = (int)ZSTD_ps_auto; bounds.upperBound = (int)ZSTD_ps_disable; return bounds; default: bounds.error = ERROR(parameter_unsupported); return bounds; } } /* ZSTD_cParam_clampBounds: * Clamps the value into the bounded range. */ static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value) { ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); if (ZSTD_isError(bounds.error)) return bounds.error; if (*value < bounds.lowerBound) *value = bounds.lowerBound; if (*value > bounds.upperBound) *value = bounds.upperBound; return 0; } #define BOUNDCHECK(cParam, val) \ do { \ RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ parameter_outOfBound, "Param out of bounds"); \ } while (0) static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) { switch(param) { case ZSTD_c_compressionLevel: case ZSTD_c_hashLog: case ZSTD_c_chainLog: case ZSTD_c_searchLog: case ZSTD_c_minMatch: case ZSTD_c_targetLength: case ZSTD_c_strategy: case ZSTD_c_blockSplitterLevel: return 1; case ZSTD_c_format: case ZSTD_c_windowLog: case ZSTD_c_contentSizeFlag: case ZSTD_c_checksumFlag: case ZSTD_c_dictIDFlag: case ZSTD_c_forceMaxWindow : case ZSTD_c_nbWorkers: case ZSTD_c_jobSize: case ZSTD_c_overlapLog: case ZSTD_c_rsyncable: case ZSTD_c_enableDedicatedDictSearch: case ZSTD_c_enableLongDistanceMatching: case ZSTD_c_ldmHashLog: case ZSTD_c_ldmMinMatch: case ZSTD_c_ldmBucketSizeLog: case ZSTD_c_ldmHashRateLog: case ZSTD_c_forceAttachDict: case ZSTD_c_literalCompressionMode: case ZSTD_c_targetCBlockSize: case ZSTD_c_srcSizeHint: case ZSTD_c_stableInBuffer: case ZSTD_c_stableOutBuffer: case ZSTD_c_blockDelimiters: case ZSTD_c_validateSequences: case ZSTD_c_splitAfterSequences: case ZSTD_c_useRowMatchFinder: case ZSTD_c_deterministicRefPrefix: case ZSTD_c_prefetchCDictTables: case ZSTD_c_enableSeqProducerFallback: case ZSTD_c_maxBlockSize: case ZSTD_c_repcodeResolution: default: return 0; } } size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) { DEBUGLOG(4, "ZSTD_CCtx_setParameter (%i, %i)", (int)param, value); if (cctx->streamStage != zcss_init) { if (ZSTD_isUpdateAuthorized(param)) { cctx->cParamsChanged = 1; } else { RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); } } switch(param) { case ZSTD_c_nbWorkers: RETURN_ERROR_IF((value!=0) && cctx->staticSize, parameter_unsupported, "MT not compatible with static alloc"); break; case ZSTD_c_compressionLevel: case ZSTD_c_windowLog: case ZSTD_c_hashLog: case ZSTD_c_chainLog: case ZSTD_c_searchLog: case ZSTD_c_minMatch: case ZSTD_c_targetLength: case ZSTD_c_strategy: case ZSTD_c_ldmHashRateLog: case ZSTD_c_format: case ZSTD_c_contentSizeFlag: case ZSTD_c_checksumFlag: case ZSTD_c_dictIDFlag: case ZSTD_c_forceMaxWindow: case ZSTD_c_forceAttachDict: case ZSTD_c_literalCompressionMode: case ZSTD_c_jobSize: case ZSTD_c_overlapLog: case ZSTD_c_rsyncable: case ZSTD_c_enableDedicatedDictSearch: case ZSTD_c_enableLongDistanceMatching: case ZSTD_c_ldmHashLog: case ZSTD_c_ldmMinMatch: case ZSTD_c_ldmBucketSizeLog: case ZSTD_c_targetCBlockSize: case ZSTD_c_srcSizeHint: case ZSTD_c_stableInBuffer: case ZSTD_c_stableOutBuffer: case ZSTD_c_blockDelimiters: case ZSTD_c_validateSequences: case ZSTD_c_splitAfterSequences: case ZSTD_c_blockSplitterLevel: case ZSTD_c_useRowMatchFinder: case ZSTD_c_deterministicRefPrefix: case ZSTD_c_prefetchCDictTables: case ZSTD_c_enableSeqProducerFallback: case ZSTD_c_maxBlockSize: case ZSTD_c_repcodeResolution: break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } return ZSTD_CCtxParams_setParameter(&cctx->requestedParams, param, value); } size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, ZSTD_cParameter param, int value) { DEBUGLOG(4, "ZSTD_CCtxParams_setParameter (%i, %i)", (int)param, value); switch(param) { case ZSTD_c_format : BOUNDCHECK(ZSTD_c_format, value); CCtxParams->format = (ZSTD_format_e)value; return (size_t)CCtxParams->format; case ZSTD_c_compressionLevel : { FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), ""); if (value == 0) CCtxParams->compressionLevel = ZSTD_CLEVEL_DEFAULT; /* 0 == default */ else CCtxParams->compressionLevel = value; if (CCtxParams->compressionLevel >= 0) return (size_t)CCtxParams->compressionLevel; return 0; /* return type (size_t) cannot represent negative values */ } case ZSTD_c_windowLog : if (value!=0) /* 0 => use default */ BOUNDCHECK(ZSTD_c_windowLog, value); CCtxParams->cParams.windowLog = (U32)value; return CCtxParams->cParams.windowLog; case ZSTD_c_hashLog : if (value!=0) /* 0 => use default */ BOUNDCHECK(ZSTD_c_hashLog, value); CCtxParams->cParams.hashLog = (U32)value; return CCtxParams->cParams.hashLog; case ZSTD_c_chainLog : if (value!=0) /* 0 => use default */ BOUNDCHECK(ZSTD_c_chainLog, value); CCtxParams->cParams.chainLog = (U32)value; return CCtxParams->cParams.chainLog; case ZSTD_c_searchLog : if (value!=0) /* 0 => use default */ BOUNDCHECK(ZSTD_c_searchLog, value); CCtxParams->cParams.searchLog = (U32)value; return (size_t)value; case ZSTD_c_minMatch : if (value!=0) /* 0 => use default */ BOUNDCHECK(ZSTD_c_minMatch, value); CCtxParams->cParams.minMatch = (U32)value; return CCtxParams->cParams.minMatch; case ZSTD_c_targetLength : BOUNDCHECK(ZSTD_c_targetLength, value); CCtxParams->cParams.targetLength = (U32)value; return CCtxParams->cParams.targetLength; case ZSTD_c_strategy : if (value!=0) /* 0 => use default */ BOUNDCHECK(ZSTD_c_strategy, value); CCtxParams->cParams.strategy = (ZSTD_strategy)value; return (size_t)CCtxParams->cParams.strategy; case ZSTD_c_contentSizeFlag : /* Content size written in frame header _when known_ (default:1) */ DEBUGLOG(4, "set content size flag = %u", (value!=0)); CCtxParams->fParams.contentSizeFlag = value != 0; return (size_t)CCtxParams->fParams.contentSizeFlag; case ZSTD_c_checksumFlag : /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ CCtxParams->fParams.checksumFlag = value != 0; return (size_t)CCtxParams->fParams.checksumFlag; case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); CCtxParams->fParams.noDictIDFlag = !value; return !CCtxParams->fParams.noDictIDFlag; case ZSTD_c_forceMaxWindow : CCtxParams->forceWindow = (value != 0); return (size_t)CCtxParams->forceWindow; case ZSTD_c_forceAttachDict : { const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); CCtxParams->attachDictPref = pref; return CCtxParams->attachDictPref; } case ZSTD_c_literalCompressionMode : { const ZSTD_ParamSwitch_e lcm = (ZSTD_ParamSwitch_e)value; BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); CCtxParams->literalCompressionMode = lcm; return CCtxParams->literalCompressionMode; } case ZSTD_c_nbWorkers : RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); return 0; case ZSTD_c_jobSize : RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); return 0; case ZSTD_c_overlapLog : RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); return 0; case ZSTD_c_rsyncable : RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); return 0; case ZSTD_c_enableDedicatedDictSearch : CCtxParams->enableDedicatedDictSearch = (value!=0); return (size_t)CCtxParams->enableDedicatedDictSearch; case ZSTD_c_enableLongDistanceMatching : BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); CCtxParams->ldmParams.enableLdm = (ZSTD_ParamSwitch_e)value; return CCtxParams->ldmParams.enableLdm; case ZSTD_c_ldmHashLog : if (value!=0) /* 0 ==> auto */ BOUNDCHECK(ZSTD_c_ldmHashLog, value); CCtxParams->ldmParams.hashLog = (U32)value; return CCtxParams->ldmParams.hashLog; case ZSTD_c_ldmMinMatch : if (value!=0) /* 0 ==> default */ BOUNDCHECK(ZSTD_c_ldmMinMatch, value); CCtxParams->ldmParams.minMatchLength = (U32)value; return CCtxParams->ldmParams.minMatchLength; case ZSTD_c_ldmBucketSizeLog : if (value!=0) /* 0 ==> default */ BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); CCtxParams->ldmParams.bucketSizeLog = (U32)value; return CCtxParams->ldmParams.bucketSizeLog; case ZSTD_c_ldmHashRateLog : if (value!=0) /* 0 ==> default */ BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); CCtxParams->ldmParams.hashRateLog = (U32)value; return CCtxParams->ldmParams.hashRateLog; case ZSTD_c_targetCBlockSize : if (value!=0) { /* 0 ==> default */ value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN); BOUNDCHECK(ZSTD_c_targetCBlockSize, value); } CCtxParams->targetCBlockSize = (U32)value; return CCtxParams->targetCBlockSize; case ZSTD_c_srcSizeHint : if (value!=0) /* 0 ==> default */ BOUNDCHECK(ZSTD_c_srcSizeHint, value); CCtxParams->srcSizeHint = value; return (size_t)CCtxParams->srcSizeHint; case ZSTD_c_stableInBuffer: BOUNDCHECK(ZSTD_c_stableInBuffer, value); CCtxParams->inBufferMode = (ZSTD_bufferMode_e)value; return CCtxParams->inBufferMode; case ZSTD_c_stableOutBuffer: BOUNDCHECK(ZSTD_c_stableOutBuffer, value); CCtxParams->outBufferMode = (ZSTD_bufferMode_e)value; return CCtxParams->outBufferMode; case ZSTD_c_blockDelimiters: BOUNDCHECK(ZSTD_c_blockDelimiters, value); CCtxParams->blockDelimiters = (ZSTD_SequenceFormat_e)value; return CCtxParams->blockDelimiters; case ZSTD_c_validateSequences: BOUNDCHECK(ZSTD_c_validateSequences, value); CCtxParams->validateSequences = value; return (size_t)CCtxParams->validateSequences; case ZSTD_c_splitAfterSequences: BOUNDCHECK(ZSTD_c_splitAfterSequences, value); CCtxParams->postBlockSplitter = (ZSTD_ParamSwitch_e)value; return CCtxParams->postBlockSplitter; case ZSTD_c_blockSplitterLevel: BOUNDCHECK(ZSTD_c_blockSplitterLevel, value); CCtxParams->preBlockSplitter_level = value; return (size_t)CCtxParams->preBlockSplitter_level; case ZSTD_c_useRowMatchFinder: BOUNDCHECK(ZSTD_c_useRowMatchFinder, value); CCtxParams->useRowMatchFinder = (ZSTD_ParamSwitch_e)value; return CCtxParams->useRowMatchFinder; case ZSTD_c_deterministicRefPrefix: BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); CCtxParams->deterministicRefPrefix = !!value; return (size_t)CCtxParams->deterministicRefPrefix; case ZSTD_c_prefetchCDictTables: BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); CCtxParams->prefetchCDictTables = (ZSTD_ParamSwitch_e)value; return CCtxParams->prefetchCDictTables; case ZSTD_c_enableSeqProducerFallback: BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); CCtxParams->enableMatchFinderFallback = value; return (size_t)CCtxParams->enableMatchFinderFallback; case ZSTD_c_maxBlockSize: if (value!=0) /* 0 ==> default */ BOUNDCHECK(ZSTD_c_maxBlockSize, value); assert(value>=0); CCtxParams->maxBlockSize = (size_t)value; return CCtxParams->maxBlockSize; case ZSTD_c_repcodeResolution: BOUNDCHECK(ZSTD_c_repcodeResolution, value); CCtxParams->searchForExternalRepcodes = (ZSTD_ParamSwitch_e)value; return CCtxParams->searchForExternalRepcodes; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } } size_t ZSTD_CCtx_getParameter(ZSTD_CCtx const* cctx, ZSTD_cParameter param, int* value) { return ZSTD_CCtxParams_getParameter(&cctx->requestedParams, param, value); } size_t ZSTD_CCtxParams_getParameter( ZSTD_CCtx_params const* CCtxParams, ZSTD_cParameter param, int* value) { switch(param) { case ZSTD_c_format : *value = (int)CCtxParams->format; break; case ZSTD_c_compressionLevel : *value = CCtxParams->compressionLevel; break; case ZSTD_c_windowLog : *value = (int)CCtxParams->cParams.windowLog; break; case ZSTD_c_hashLog : *value = (int)CCtxParams->cParams.hashLog; break; case ZSTD_c_chainLog : *value = (int)CCtxParams->cParams.chainLog; break; case ZSTD_c_searchLog : *value = (int)CCtxParams->cParams.searchLog; break; case ZSTD_c_minMatch : *value = (int)CCtxParams->cParams.minMatch; break; case ZSTD_c_targetLength : *value = (int)CCtxParams->cParams.targetLength; break; case ZSTD_c_strategy : *value = (int)CCtxParams->cParams.strategy; break; case ZSTD_c_contentSizeFlag : *value = CCtxParams->fParams.contentSizeFlag; break; case ZSTD_c_checksumFlag : *value = CCtxParams->fParams.checksumFlag; break; case ZSTD_c_dictIDFlag : *value = !CCtxParams->fParams.noDictIDFlag; break; case ZSTD_c_forceMaxWindow : *value = CCtxParams->forceWindow; break; case ZSTD_c_forceAttachDict : *value = (int)CCtxParams->attachDictPref; break; case ZSTD_c_literalCompressionMode : *value = (int)CCtxParams->literalCompressionMode; break; case ZSTD_c_nbWorkers : assert(CCtxParams->nbWorkers == 0); *value = CCtxParams->nbWorkers; break; case ZSTD_c_jobSize : RETURN_ERROR(parameter_unsupported, "not compiled with multithreading"); case ZSTD_c_overlapLog : RETURN_ERROR(parameter_unsupported, "not compiled with multithreading"); case ZSTD_c_rsyncable : RETURN_ERROR(parameter_unsupported, "not compiled with multithreading"); case ZSTD_c_enableDedicatedDictSearch : *value = CCtxParams->enableDedicatedDictSearch; break; case ZSTD_c_enableLongDistanceMatching : *value = (int)CCtxParams->ldmParams.enableLdm; break; case ZSTD_c_ldmHashLog : *value = (int)CCtxParams->ldmParams.hashLog; break; case ZSTD_c_ldmMinMatch : *value = (int)CCtxParams->ldmParams.minMatchLength; break; case ZSTD_c_ldmBucketSizeLog : *value = (int)CCtxParams->ldmParams.bucketSizeLog; break; case ZSTD_c_ldmHashRateLog : *value = (int)CCtxParams->ldmParams.hashRateLog; break; case ZSTD_c_targetCBlockSize : *value = (int)CCtxParams->targetCBlockSize; break; case ZSTD_c_srcSizeHint : *value = (int)CCtxParams->srcSizeHint; break; case ZSTD_c_stableInBuffer : *value = (int)CCtxParams->inBufferMode; break; case ZSTD_c_stableOutBuffer : *value = (int)CCtxParams->outBufferMode; break; case ZSTD_c_blockDelimiters : *value = (int)CCtxParams->blockDelimiters; break; case ZSTD_c_validateSequences : *value = (int)CCtxParams->validateSequences; break; case ZSTD_c_splitAfterSequences : *value = (int)CCtxParams->postBlockSplitter; break; case ZSTD_c_blockSplitterLevel : *value = CCtxParams->preBlockSplitter_level; break; case ZSTD_c_useRowMatchFinder : *value = (int)CCtxParams->useRowMatchFinder; break; case ZSTD_c_deterministicRefPrefix: *value = (int)CCtxParams->deterministicRefPrefix; break; case ZSTD_c_prefetchCDictTables: *value = (int)CCtxParams->prefetchCDictTables; break; case ZSTD_c_enableSeqProducerFallback: *value = CCtxParams->enableMatchFinderFallback; break; case ZSTD_c_maxBlockSize: *value = (int)CCtxParams->maxBlockSize; break; case ZSTD_c_repcodeResolution: *value = (int)CCtxParams->searchForExternalRepcodes; break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } return 0; } /* ZSTD_CCtx_setParametersUsingCCtxParams() : * just applies `params` into `cctx` * no action is performed, parameters are merely stored. * If ZSTDMT is enabled, parameters are pushed to cctx->mtctx. * This is possible even if a compression is ongoing. * In which case, new parameters will be applied on the fly, starting with next compression job. */ size_t ZSTD_CCtx_setParametersUsingCCtxParams( ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params) { DEBUGLOG(4, "ZSTD_CCtx_setParametersUsingCCtxParams"); RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, "The context is in the wrong stage!"); RETURN_ERROR_IF(cctx->cdict, stage_wrong, "Can't override parameters with cdict attached (some must " "be inherited from the cdict)."); cctx->requestedParams = *params; return 0; } size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) { ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); DEBUGLOG(4, "ZSTD_CCtx_setCParams"); /* only update if all parameters are valid */ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, (int)cparams.windowLog), ""); FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, (int)cparams.chainLog), ""); FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, (int)cparams.hashLog), ""); FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, (int)cparams.searchLog), ""); FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, (int)cparams.minMatch), ""); FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, (int)cparams.targetLength), ""); FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, (int)cparams.strategy), ""); return 0; } size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) { ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); DEBUGLOG(4, "ZSTD_CCtx_setFParams"); FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); return 0; } size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) { DEBUGLOG(4, "ZSTD_CCtx_setParams"); /* First check cParams, because we want to update all or none. */ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); /* Next set fParams, because this could fail if the cctx isn't in init stage. */ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); /* Finally set cParams, which should succeed. */ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); return 0; } size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) { DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, "Can't set pledgedSrcSize when not in init stage."); cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; return 0; } static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams( int const compressionLevel, size_t const dictSize); static int ZSTD_dedicatedDictSearch_isSupported( const ZSTD_compressionParameters* cParams); static void ZSTD_dedicatedDictSearch_revertCParams( ZSTD_compressionParameters* cParams); /* * Initializes the local dictionary using requested parameters. * NOTE: Initialization does not employ the pledged src size, * because the dictionary may be used for multiple compressions. */ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) { ZSTD_localDict* const dl = &cctx->localDict; if (dl->dict == NULL) { /* No local dictionary. */ assert(dl->dictBuffer == NULL); assert(dl->cdict == NULL); assert(dl->dictSize == 0); return 0; } if (dl->cdict != NULL) { /* Local dictionary already initialized. */ assert(cctx->cdict == dl->cdict); return 0; } assert(dl->dictSize > 0); assert(cctx->cdict == NULL); assert(cctx->prefixDict.dict == NULL); dl->cdict = ZSTD_createCDict_advanced2( dl->dict, dl->dictSize, ZSTD_dlm_byRef, dl->dictContentType, &cctx->requestedParams, cctx->customMem); RETURN_ERROR_IF(!dl->cdict, memory_allocation, "ZSTD_createCDict_advanced failed"); cctx->cdict = dl->cdict; return 0; } size_t ZSTD_CCtx_loadDictionary_advanced( ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) { DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, "Can't load a dictionary when cctx is not in init stage."); ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */ if (dict == NULL || dictSize == 0) /* no dictionary */ return 0; if (dictLoadMethod == ZSTD_dlm_byRef) { cctx->localDict.dict = dict; } else { /* copy dictionary content inside CCtx to own its lifetime */ void* dictBuffer; RETURN_ERROR_IF(cctx->staticSize, memory_allocation, "static CCtx can't allocate for an internal copy of dictionary"); dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem); RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation, "allocation failed for dictionary content"); ZSTD_memcpy(dictBuffer, dict, dictSize); cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */ cctx->localDict.dict = dictBuffer; /* read-only reference */ } cctx->localDict.dictSize = dictSize; cctx->localDict.dictContentType = dictContentType; return 0; } size_t ZSTD_CCtx_loadDictionary_byReference( ZSTD_CCtx* cctx, const void* dict, size_t dictSize) { return ZSTD_CCtx_loadDictionary_advanced( cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto); } size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize) { return ZSTD_CCtx_loadDictionary_advanced( cctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto); } size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) { RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, "Can't ref a dict when ctx not in init stage."); /* Free the existing local cdict (if any) to save memory. */ ZSTD_clearAllDicts(cctx); cctx->cdict = cdict; return 0; } size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool) { RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, "Can't ref a pool when ctx not in init stage."); cctx->pool = pool; return 0; } size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize) { return ZSTD_CCtx_refPrefix_advanced(cctx, prefix, prefixSize, ZSTD_dct_rawContent); } size_t ZSTD_CCtx_refPrefix_advanced( ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType) { RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, "Can't ref a prefix when ctx not in init stage."); ZSTD_clearAllDicts(cctx); if (prefix != NULL && prefixSize > 0) { cctx->prefixDict.dict = prefix; cctx->prefixDict.dictSize = prefixSize; cctx->prefixDict.dictContentType = dictContentType; } return 0; } /*! ZSTD_CCtx_reset() : * Also dumps dictionary */ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) { if ( (reset == ZSTD_reset_session_only) || (reset == ZSTD_reset_session_and_parameters) ) { cctx->streamStage = zcss_init; cctx->pledgedSrcSizePlusOne = 0; } if ( (reset == ZSTD_reset_parameters) || (reset == ZSTD_reset_session_and_parameters) ) { RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, "Reset parameters is only possible during init stage."); ZSTD_clearAllDicts(cctx); return ZSTD_CCtxParams_reset(&cctx->requestedParams); } return 0; } /* ZSTD_checkCParams() : control CParam values remain within authorized range. @return : 0, or an error code if one value is beyond authorized range */ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams) { BOUNDCHECK(ZSTD_c_windowLog, (int)cParams.windowLog); BOUNDCHECK(ZSTD_c_chainLog, (int)cParams.chainLog); BOUNDCHECK(ZSTD_c_hashLog, (int)cParams.hashLog); BOUNDCHECK(ZSTD_c_searchLog, (int)cParams.searchLog); BOUNDCHECK(ZSTD_c_minMatch, (int)cParams.minMatch); BOUNDCHECK(ZSTD_c_targetLength,(int)cParams.targetLength); BOUNDCHECK(ZSTD_c_strategy, (int)cParams.strategy); return 0; } /* ZSTD_clampCParams() : * make CParam values within valid range. * @return : valid CParams */ static ZSTD_compressionParameters ZSTD_clampCParams(ZSTD_compressionParameters cParams) { # define CLAMP_TYPE(cParam, val, type) \ do { \ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \ else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \ } while (0) # define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned) CLAMP(ZSTD_c_windowLog, cParams.windowLog); CLAMP(ZSTD_c_chainLog, cParams.chainLog); CLAMP(ZSTD_c_hashLog, cParams.hashLog); CLAMP(ZSTD_c_searchLog, cParams.searchLog); CLAMP(ZSTD_c_minMatch, cParams.minMatch); CLAMP(ZSTD_c_targetLength,cParams.targetLength); CLAMP_TYPE(ZSTD_c_strategy,cParams.strategy, ZSTD_strategy); return cParams; } /* ZSTD_cycleLog() : * condition for correct operation : hashLog > 1 */ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat) { U32 const btScale = ((U32)strat >= (U32)ZSTD_btlazy2); return hashLog - btScale; } /* ZSTD_dictAndWindowLog() : * Returns an adjusted window log that is large enough to fit the source and the dictionary. * The zstd format says that the entire dictionary is valid if one byte of the dictionary * is within the window. So the hashLog and chainLog should be large enough to reference both * the dictionary and the window. So we must use this adjusted dictAndWindowLog when downsizing * the hashLog and windowLog. * NOTE: srcSize must not be ZSTD_CONTENTSIZE_UNKNOWN. */ static U32 ZSTD_dictAndWindowLog(U32 windowLog, U64 srcSize, U64 dictSize) { const U64 maxWindowSize = 1ULL << ZSTD_WINDOWLOG_MAX; /* No dictionary ==> No change */ if (dictSize == 0) { return windowLog; } assert(windowLog <= ZSTD_WINDOWLOG_MAX); assert(srcSize != ZSTD_CONTENTSIZE_UNKNOWN); /* Handled in ZSTD_adjustCParams_internal() */ { U64 const windowSize = 1ULL << windowLog; U64 const dictAndWindowSize = dictSize + windowSize; /* If the window size is already large enough to fit both the source and the dictionary * then just use the window size. Otherwise adjust so that it fits the dictionary and * the window. */ if (windowSize >= dictSize + srcSize) { return windowLog; /* Window size large enough already */ } else if (dictAndWindowSize >= maxWindowSize) { return ZSTD_WINDOWLOG_MAX; /* Larger than max window log */ } else { return ZSTD_highbit32((U32)dictAndWindowSize - 1) + 1; } } } /* ZSTD_adjustCParams_internal() : * optimize `cPar` for a specified input (`srcSize` and `dictSize`). * mostly downsize to reduce memory consumption and initialization latency. * `srcSize` can be ZSTD_CONTENTSIZE_UNKNOWN when not known. * `mode` is the mode for parameter adjustment. See docs for `ZSTD_CParamMode_e`. * note : `srcSize==0` means 0! * condition : cPar is presumed validated (can be checked using ZSTD_checkCParams()). */ static ZSTD_compressionParameters ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize, ZSTD_CParamMode_e mode, ZSTD_ParamSwitch_e useRowMatchFinder) { const U64 minSrcSize = 513; /* (1<<9) + 1 */ const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); assert(ZSTD_checkCParams(cPar)==0); /* Cascade the selected strategy down to the next-highest one built into * this binary. */ #ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR if (cPar.strategy == ZSTD_btultra2) { cPar.strategy = ZSTD_btultra; } if (cPar.strategy == ZSTD_btultra) { cPar.strategy = ZSTD_btopt; } #endif #ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR if (cPar.strategy == ZSTD_btopt) { cPar.strategy = ZSTD_btlazy2; } #endif #ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR if (cPar.strategy == ZSTD_btlazy2) { cPar.strategy = ZSTD_lazy2; } #endif #ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR if (cPar.strategy == ZSTD_lazy2) { cPar.strategy = ZSTD_lazy; } #endif #ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR if (cPar.strategy == ZSTD_lazy) { cPar.strategy = ZSTD_greedy; } #endif #ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR if (cPar.strategy == ZSTD_greedy) { cPar.strategy = ZSTD_dfast; } #endif #ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR if (cPar.strategy == ZSTD_dfast) { cPar.strategy = ZSTD_fast; cPar.targetLength = 0; } #endif switch (mode) { case ZSTD_cpm_unknown: case ZSTD_cpm_noAttachDict: /* If we don't know the source size, don't make any * assumptions about it. We will already have selected * smaller parameters if a dictionary is in use. */ break; case ZSTD_cpm_createCDict: /* Assume a small source size when creating a dictionary * with an unknown source size. */ if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN) srcSize = minSrcSize; break; case ZSTD_cpm_attachDict: /* Dictionary has its own dedicated parameters which have * already been selected. We are selecting parameters * for only the source. */ dictSize = 0; break; default: assert(0); break; } /* resize windowLog if input is small enough, to use less memory */ if ( (srcSize <= maxWindowResize) && (dictSize <= maxWindowResize) ) { U32 const tSize = (U32)(srcSize + dictSize); static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : ZSTD_highbit32(tSize-1) + 1; if (cPar.windowLog > srcLog) cPar.windowLog = srcLog; } if (srcSize != ZSTD_CONTENTSIZE_UNKNOWN) { U32 const dictAndWindowLog = ZSTD_dictAndWindowLog(cPar.windowLog, (U64)srcSize, (U64)dictSize); U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy); if (cPar.hashLog > dictAndWindowLog+1) cPar.hashLog = dictAndWindowLog+1; if (cycleLog > dictAndWindowLog) cPar.chainLog -= (cycleLog - dictAndWindowLog); } if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ /* We can't use more than 32 bits of hash in total, so that means that we require: * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 */ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; if (cPar.hashLog > maxShortCacheHashLog) { cPar.hashLog = maxShortCacheHashLog; } if (cPar.chainLog > maxShortCacheHashLog) { cPar.chainLog = maxShortCacheHashLog; } } /* At this point, we aren't 100% sure if we are using the row match finder. * Unless it is explicitly disabled, conservatively assume that it is enabled. * In this case it will only be disabled for small sources, so shrinking the * hash log a little bit shouldn't result in any ratio loss. */ if (useRowMatchFinder == ZSTD_ps_auto) useRowMatchFinder = ZSTD_ps_enable; /* We can't hash more than 32-bits in total. So that means that we require: * (hashLog - rowLog + 8) <= 32 */ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { /* Switch to 32-entry rows if searchLog is 5 (or more) */ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; U32 const maxHashLog = maxRowHashLog + rowLog; assert(cPar.hashLog >= rowLog); if (cPar.hashLog > maxHashLog) { cPar.hashLog = maxHashLog; } } return cPar; } ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize) { cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); } static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); static void ZSTD_overrideCParams( ZSTD_compressionParameters* cParams, const ZSTD_compressionParameters* overrides) { if (overrides->windowLog) cParams->windowLog = overrides->windowLog; if (overrides->hashLog) cParams->hashLog = overrides->hashLog; if (overrides->chainLog) cParams->chainLog = overrides->chainLog; if (overrides->searchLog) cParams->searchLog = overrides->searchLog; if (overrides->minMatch) cParams->minMatch = overrides->minMatch; if (overrides->targetLength) cParams->targetLength = overrides->targetLength; if (overrides->strategy) cParams->strategy = overrides->strategy; } ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) { ZSTD_compressionParameters cParams; if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) { assert(CCtxParams->srcSizeHint>=0); srcSizeHint = (U64)CCtxParams->srcSizeHint; } cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize, mode); if (CCtxParams->ldmParams.enableLdm == ZSTD_ps_enable) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); assert(!ZSTD_checkCParams(cParams)); /* srcSizeHint == 0 means 0 */ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); } static size_t ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, const ZSTD_ParamSwitch_e useRowMatchFinder, const int enableDedicatedDictSearch, const U32 forCCtx) { /* chain table size should be 0 for fast or row-hash strategies */ size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, enableDedicatedDictSearch && !forCCtx) ? ((size_t)1 << cParams->chainLog) : 0; size_t const hSize = ((size_t)1) << cParams->hashLog; U32 const hashLog3 = (forCCtx && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0; /* We don't use ZSTD_cwksp_alloc_size() here because the tables aren't * surrounded by redzones in ASAN. */ size_t const tableSpace = chainSize * sizeof(U32) + hSize * sizeof(U32) + h3Size * sizeof(U32); size_t const optPotentialSpace = ZSTD_cwksp_aligned64_alloc_size((MaxML+1) * sizeof(U32)) + ZSTD_cwksp_aligned64_alloc_size((MaxLL+1) * sizeof(U32)) + ZSTD_cwksp_aligned64_alloc_size((MaxOff+1) * sizeof(U32)) + ZSTD_cwksp_aligned64_alloc_size((1<<Litbits) * sizeof(U32)) + ZSTD_cwksp_aligned64_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)) + ZSTD_cwksp_aligned64_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder) ? ZSTD_cwksp_aligned64_alloc_size(hSize) : 0; size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) ? optPotentialSpace : 0; size_t const slackSpace = ZSTD_cwksp_slack_space_required(); /* tables are guaranteed to be sized in multiples of 64 bytes (or 16 uint32_t) */ ZSTD_STATIC_ASSERT(ZSTD_HASHLOG_MIN >= 4 && ZSTD_WINDOWLOG_MIN >= 4 && ZSTD_CHAINLOG_MIN >= 4); assert(useRowMatchFinder != ZSTD_ps_auto); DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u", (U32)chainSize, (U32)hSize, (U32)h3Size); return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; } /* Helper function for calculating memory requirements. * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) { U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4; return blockSize / divider; } static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( const ZSTD_compressionParameters* cParams, const ldmParams_t* ldmParams, const int isStatic, const ZSTD_ParamSwitch_e useRowMatchFinder, const size_t buffInSize, const size_t buffOutSize, const U64 pledgedSrcSize, int useSequenceProducer, size_t maxBlockSize) { size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + ZSTD_cwksp_aligned64_alloc_size(maxNbSeq * sizeof(SeqDef)) + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); size_t const tmpWorkSpace = ZSTD_cwksp_alloc_size(TMP_WORKSPACE_SIZE); size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1); size_t const ldmSpace = ZSTD_ldm_getTableSize(*ldmParams); size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSize); size_t const ldmSeqSpace = ldmParams->enableLdm == ZSTD_ps_enable ? ZSTD_cwksp_aligned64_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) + ZSTD_cwksp_alloc_size(buffOutSize); size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); size_t const externalSeqSpace = useSequenceProducer ? ZSTD_cwksp_aligned64_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) : 0; size_t const neededSpace = cctxSpace + tmpWorkSpace + blockStateSpace + ldmSpace + ldmSeqSpace + matchStateSize + tokenSpace + bufferSpace + externalSeqSpace; DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); return neededSpace; } size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) { ZSTD_compressionParameters const cParams = ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, &cParams); RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); /* estimateCCtxSize is for one-shot compression. So no buffers should * be needed. However, we still allocate two 0-sized buffers, which can * take space under ASAN. */ return ZSTD_estimateCCtxSize_usingCCtxParams_internal( &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); } size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) { ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams); if (ZSTD_rowMatchFinderSupported(cParams.strategy)) { /* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */ size_t noRowCCtxSize; size_t rowCCtxSize; initialParams.useRowMatchFinder = ZSTD_ps_disable; noRowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); initialParams.useRowMatchFinder = ZSTD_ps_enable; rowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); return MAX(noRowCCtxSize, rowCCtxSize); } else { return ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); } } static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel) { int tier = 0; size_t largestSize = 0; static const unsigned long long srcSizeTiers[4] = {16 KB, 128 KB, 256 KB, ZSTD_CONTENTSIZE_UNKNOWN}; for (; tier < 4; ++tier) { /* Choose the set of cParams for a given level across all srcSizes that give the largest cctxSize */ ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeTiers[tier], 0, ZSTD_cpm_noAttachDict); largestSize = MAX(ZSTD_estimateCCtxSize_usingCParams(cParams), largestSize); } return largestSize; } size_t ZSTD_estimateCCtxSize(int compressionLevel) { int level; size_t memBudget = 0; for (level=MIN(compressionLevel, 1); level<=compressionLevel; level++) { /* Ensure monotonically increasing memory usage as compression level increases */ size_t const newMB = ZSTD_estimateCCtxSize_internal(level); if (newMB > memBudget) memBudget = newMB; } return memBudget; } size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) { RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); { ZSTD_compressionParameters const cParams = ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog); size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) ? ((size_t)1 << cParams.windowLog) + blockSize : 0; size_t const outBuffSize = (params->outBufferMode == ZSTD_bm_buffered) ? ZSTD_compressBound(blockSize) + 1 : 0; ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, ¶ms->cParams); return ZSTD_estimateCCtxSize_usingCCtxParams_internal( &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); } } size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams) { ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams); if (ZSTD_rowMatchFinderSupported(cParams.strategy)) { /* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */ size_t noRowCCtxSize; size_t rowCCtxSize; initialParams.useRowMatchFinder = ZSTD_ps_disable; noRowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); initialParams.useRowMatchFinder = ZSTD_ps_enable; rowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); return MAX(noRowCCtxSize, rowCCtxSize); } else { return ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); } } static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel) { ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); return ZSTD_estimateCStreamSize_usingCParams(cParams); } size_t ZSTD_estimateCStreamSize(int compressionLevel) { int level; size_t memBudget = 0; for (level=MIN(compressionLevel, 1); level<=compressionLevel; level++) { size_t const newMB = ZSTD_estimateCStreamSize_internal(level); if (newMB > memBudget) memBudget = newMB; } return memBudget; } /* ZSTD_getFrameProgression(): * tells how much data has been consumed (input) and produced (output) for current frame. * able to count progression inside worker threads (non-blocking mode). */ ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx) { { ZSTD_frameProgression fp; size_t const buffered = (cctx->inBuff == NULL) ? 0 : cctx->inBuffPos - cctx->inToCompress; if (buffered) assert(cctx->inBuffPos >= cctx->inToCompress); assert(buffered <= ZSTD_BLOCKSIZE_MAX); fp.ingested = cctx->consumedSrcSize + buffered; fp.consumed = cctx->consumedSrcSize; fp.produced = cctx->producedCSize; fp.flushed = cctx->producedCSize; /* simplified; some data might still be left within streaming output buffer */ fp.currentJobID = 0; fp.nbActiveWorkers = 0; return fp; } } /*! ZSTD_toFlushNow() * Only useful for multithreading scenarios currently (nbWorkers >= 1). */ size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx) { (void)cctx; return 0; /* over-simplification; could also check if context is currently running in streaming mode, and in which case, report how many bytes are left to be flushed within output buffer */ } static void ZSTD_assertEqualCParams(ZSTD_compressionParameters cParams1, ZSTD_compressionParameters cParams2) { (void)cParams1; (void)cParams2; assert(cParams1.windowLog == cParams2.windowLog); assert(cParams1.chainLog == cParams2.chainLog); assert(cParams1.hashLog == cParams2.hashLog); assert(cParams1.searchLog == cParams2.searchLog); assert(cParams1.minMatch == cParams2.minMatch); assert(cParams1.targetLength == cParams2.targetLength); assert(cParams1.strategy == cParams2.strategy); } void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs) { int i; for (i = 0; i < ZSTD_REP_NUM; ++i) bs->rep[i] = repStartValue[i]; bs->entropy.huf.repeatMode = HUF_repeat_none; bs->entropy.fse.offcode_repeatMode = FSE_repeat_none; bs->entropy.fse.matchlength_repeatMode = FSE_repeat_none; bs->entropy.fse.litlength_repeatMode = FSE_repeat_none; } /*! ZSTD_invalidateMatchState() * Invalidate all the matches in the match finder tables. * Requires nextSrc and base to be set (can be NULL). */ static void ZSTD_invalidateMatchState(ZSTD_MatchState_t* ms) { ZSTD_window_clear(&ms->window); ms->nextToUpdate = ms->window.dictLimit; ms->loadedDictEnd = 0; ms->opt.litLengthSum = 0; /* force reset of btopt stats */ ms->dictMatchState = NULL; } /* * Controls, for this matchState reset, whether the tables need to be cleared / * prepared for the coming compression (ZSTDcrp_makeClean), or whether the * tables can be left unclean (ZSTDcrp_leaveDirty), because we know that a * subsequent operation will overwrite the table space anyways (e.g., copying * the matchState contents in from a CDict). */ typedef enum { ZSTDcrp_makeClean, ZSTDcrp_leaveDirty } ZSTD_compResetPolicy_e; /* * Controls, for this matchState reset, whether indexing can continue where it * left off (ZSTDirp_continue), or whether it needs to be restarted from zero * (ZSTDirp_reset). */ typedef enum { ZSTDirp_continue, ZSTDirp_reset } ZSTD_indexResetPolicy_e; typedef enum { ZSTD_resetTarget_CDict, ZSTD_resetTarget_CCtx } ZSTD_resetTarget_e; /* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */ static U64 ZSTD_bitmix(U64 val, U64 len) { val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24); val *= 0x9FB21C651E98DF25ULL; val ^= (val >> 35) + len ; val *= 0x9FB21C651E98DF25ULL; return val ^ (val >> 28); } /* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */ static void ZSTD_advanceHashSalt(ZSTD_MatchState_t* ms) { ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4); } static size_t ZSTD_reset_matchState(ZSTD_MatchState_t* ms, ZSTD_cwksp* ws, const ZSTD_compressionParameters* cParams, const ZSTD_ParamSwitch_e useRowMatchFinder, const ZSTD_compResetPolicy_e crp, const ZSTD_indexResetPolicy_e forceResetIndex, const ZSTD_resetTarget_e forWho) { /* disable chain table allocation for fast or row-based strategies */ size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, ms->dedicatedDictSearch && (forWho == ZSTD_resetTarget_CDict)) ? ((size_t)1 << cParams->chainLog) : 0; size_t const hSize = ((size_t)1) << cParams->hashLog; U32 const hashLog3 = ((forWho == ZSTD_resetTarget_CCtx) && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0; DEBUGLOG(4, "reset indices : %u", forceResetIndex == ZSTDirp_reset); assert(useRowMatchFinder != ZSTD_ps_auto); if (forceResetIndex == ZSTDirp_reset) { ZSTD_window_init(&ms->window); ZSTD_cwksp_mark_tables_dirty(ws); } ms->hashLog3 = hashLog3; ms->lazySkipping = 0; ZSTD_invalidateMatchState(ms); assert(!ZSTD_cwksp_reserve_failed(ws)); /* check that allocation hasn't already failed */ ZSTD_cwksp_clear_tables(ws); DEBUGLOG(5, "reserving table space"); /* table Space */ ms->hashTable = (U32*)ZSTD_cwksp_reserve_table(ws, hSize * sizeof(U32)); ms->chainTable = (U32*)ZSTD_cwksp_reserve_table(ws, chainSize * sizeof(U32)); ms->hashTable3 = (U32*)ZSTD_cwksp_reserve_table(ws, h3Size * sizeof(U32)); RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, "failed a workspace allocation in ZSTD_reset_matchState"); DEBUGLOG(4, "reset table : %u", crp!=ZSTDcrp_leaveDirty); if (crp!=ZSTDcrp_leaveDirty) { /* reset tables only */ ZSTD_cwksp_clean_tables(ws); } if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { /* Row match finder needs an additional table of hashes ("tags") */ size_t const tagTableSize = hSize; /* We want to generate a new salt in case we reset a Cctx, but we always want to use * 0 when we reset a Cdict */ if(forWho == ZSTD_resetTarget_CCtx) { ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); ZSTD_advanceHashSalt(ms); } else { /* When we are not salting we want to always memset the memory */ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned64(ws, tagTableSize); ZSTD_memset(ms->tagTable, 0, tagTableSize); ms->hashSalt = 0; } { /* Switch to 32-entry rows if searchLog is 5 (or more) */ U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); assert(cParams->hashLog >= rowLog); ms->rowHashLog = cParams->hashLog - rowLog; } } /* opt parser space */ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { DEBUGLOG(4, "reserving optimal parser space"); ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (1<<Litbits) * sizeof(unsigned)); ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxLL+1) * sizeof(unsigned)); ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxML+1) * sizeof(unsigned)); ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxOff+1) * sizeof(unsigned)); ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned64(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)); ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned64(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); } ms->cParams = *cParams; RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, "failed a workspace allocation in ZSTD_reset_matchState"); return 0; } /* ZSTD_indexTooCloseToMax() : * minor optimization : prefer memset() rather than reduceIndex() * which is measurably slow in some circumstances (reported for Visual Studio). * Works when re-using a context for a lot of smallish inputs : * if all inputs are smaller than ZSTD_INDEXOVERFLOW_MARGIN, * memset() will be triggered before reduceIndex(). */ #define ZSTD_INDEXOVERFLOW_MARGIN (16 MB) static int ZSTD_indexTooCloseToMax(ZSTD_window_t w) { return (size_t)(w.nextSrc - w.base) > (ZSTD_CURRENT_MAX - ZSTD_INDEXOVERFLOW_MARGIN); } /* ZSTD_dictTooBig(): * When dictionaries are larger than ZSTD_CHUNKSIZE_MAX they can't be loaded in * one go generically. So we ensure that in that case we reset the tables to zero, * so that we can load as much of the dictionary as possible. */ static int ZSTD_dictTooBig(size_t const loadedDictSize) { return loadedDictSize > ZSTD_CHUNKSIZE_MAX; } /*! ZSTD_resetCCtx_internal() : * @param loadedDictSize The size of the dictionary to be loaded * into the context, if any. If no dictionary is used, or the * dictionary is being attached / copied, then pass 0. * note : `params` are assumed fully validated at this stage. */ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, ZSTD_CCtx_params const* params, U64 const pledgedSrcSize, size_t const loadedDictSize, ZSTD_compResetPolicy_e const crp, ZSTD_buffered_policy_e const zbuff) { ZSTD_cwksp* const ws = &zc->workspace; DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d useBlockSplitter=%d", (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->postBlockSplitter); assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); zc->isFirstBlock = 1; /* Set applied params early so we can modify them for LDM, * and point params at the applied params. */ zc->appliedParams = *params; params = &zc->appliedParams; assert(params->useRowMatchFinder != ZSTD_ps_auto); assert(params->postBlockSplitter != ZSTD_ps_auto); assert(params->ldmParams.enableLdm != ZSTD_ps_auto); assert(params->maxBlockSize != 0); if (params->ldmParams.enableLdm == ZSTD_ps_enable) { /* Adjust long distance matching parameters */ ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); assert(params->ldmParams.hashLog >= params->ldmParams.bucketSizeLog); assert(params->ldmParams.hashRateLog < 32); } { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); size_t const blockSize = MIN(params->maxBlockSize, windowSize); size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params)); size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) ? ZSTD_compressBound(blockSize) + 1 : 0; size_t const buffInSize = (zbuff == ZSTDb_buffered && params->inBufferMode == ZSTD_bm_buffered) ? windowSize + blockSize : 0; size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize); int const indexTooClose = ZSTD_indexTooCloseToMax(zc->blockState.matchState.window); int const dictTooBig = ZSTD_dictTooBig(loadedDictSize); ZSTD_indexResetPolicy_e needsIndexReset = (indexTooClose || dictTooBig || !zc->initialized) ? ZSTDirp_reset : ZSTDirp_continue; size_t const neededSpace = ZSTD_estimateCCtxSize_usingCCtxParams_internal( ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize); FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); if (!zc->staticSize) ZSTD_cwksp_bump_oversized_duration(ws, 0); { /* Check if workspace is large enough, alloc a new one if needed */ int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); int resizeWorkspace = workspaceTooSmall || workspaceWasteful; DEBUGLOG(4, "Need %zu B workspace", neededSpace); DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); if (resizeWorkspace) { DEBUGLOG(4, "Resize workspaceSize from %zuKB to %zuKB", ZSTD_cwksp_sizeof(ws) >> 10, neededSpace >> 10); RETURN_ERROR_IF(zc->staticSize, memory_allocation, "static cctx : no resize"); needsIndexReset = ZSTDirp_reset; ZSTD_cwksp_free(ws, zc->customMem); FORWARD_IF_ERROR(ZSTD_cwksp_create(ws, neededSpace, zc->customMem), ""); DEBUGLOG(5, "reserving object space"); /* Statically sized space. * tmpWorkspace never moves, * though prev/next block swap places */ assert(ZSTD_cwksp_check_available(ws, 2 * sizeof(ZSTD_compressedBlockState_t))); zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); RETURN_ERROR_IF(zc->blockState.prevCBlock == NULL, memory_allocation, "couldn't allocate prevCBlock"); zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock"); zc->tmpWorkspace = ZSTD_cwksp_reserve_object(ws, TMP_WORKSPACE_SIZE); RETURN_ERROR_IF(zc->tmpWorkspace == NULL, memory_allocation, "couldn't allocate tmpWorkspace"); zc->tmpWkspSize = TMP_WORKSPACE_SIZE; } } ZSTD_cwksp_clear(ws); /* init params */ zc->blockState.matchState.cParams = params->cParams; zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable; zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; zc->consumedSrcSize = 0; zc->producedCSize = 0; if (pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN) zc->appliedParams.fParams.contentSizeFlag = 0; DEBUGLOG(4, "pledged content size : %u ; flag : %u", (unsigned)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag); zc->blockSizeMax = blockSize; xxh64_reset(&zc->xxhState, 0); zc->stage = ZSTDcs_init; zc->dictID = 0; zc->dictContentSize = 0; ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); FORWARD_IF_ERROR(ZSTD_reset_matchState( &zc->blockState.matchState, ws, ¶ms->cParams, params->useRowMatchFinder, crp, needsIndexReset, ZSTD_resetTarget_CCtx), ""); zc->seqStore.sequencesStart = (SeqDef*)ZSTD_cwksp_reserve_aligned64(ws, maxNbSeq * sizeof(SeqDef)); /* ldm hash table */ if (params->ldmParams.enableLdm == ZSTD_ps_enable) { /* TODO: avoid memset? */ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned64(ws, ldmHSize * sizeof(ldmEntry_t)); ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned64(ws, maxNbLdmSeq * sizeof(rawSeq)); zc->maxNbLdmSequences = maxNbLdmSeq; ZSTD_window_init(&zc->ldmState.window); zc->ldmState.loadedDictEnd = 0; } /* reserve space for block-level external sequences */ if (ZSTD_hasExtSeqProd(params)) { size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); zc->extSeqBufCapacity = maxNbExternalSeq; zc->extSeqBuf = (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned64(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); } /* buffers */ /* ZSTD_wildcopy() is used to copy into the literals buffer, * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. */ zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); zc->seqStore.maxNbLit = blockSize; zc->bufferedPolicy = zbuff; zc->inBuffSize = buffInSize; zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); zc->outBuffSize = buffOutSize; zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize); /* ldm bucketOffsets table */ if (params->ldmParams.enableLdm == ZSTD_ps_enable) { /* TODO: avoid memset? */ size_t const numBuckets = ((size_t)1) << (params->ldmParams.hashLog - params->ldmParams.bucketSizeLog); zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, numBuckets); ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets); } /* sequences storage */ ZSTD_referenceExternalSequences(zc, NULL, 0); zc->seqStore.maxNbSeq = maxNbSeq; zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); zc->initialized = 1; return 0; } } /* ZSTD_invalidateRepCodes() : * ensures next compression will not use repcodes from previous block. * Note : only works with regular variant; * do not use with extDict variant ! */ void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx) { int i; for (i=0; i<ZSTD_REP_NUM; i++) cctx->blockState.prevCBlock->rep[i] = 0; assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window)); } /* These are the approximate sizes for each strategy past which copying the * dictionary tables into the working context is faster than using them * in-place. */ static const size_t attachDictSizeCutoffs[ZSTD_STRATEGY_MAX+1] = { 8 KB, /* unused */ 8 KB, /* ZSTD_fast */ 16 KB, /* ZSTD_dfast */ 32 KB, /* ZSTD_greedy */ 32 KB, /* ZSTD_lazy */ 32 KB, /* ZSTD_lazy2 */ 32 KB, /* ZSTD_btlazy2 */ 32 KB, /* ZSTD_btopt */ 8 KB, /* ZSTD_btultra */ 8 KB /* ZSTD_btultra2 */ }; static int ZSTD_shouldAttachDict(const ZSTD_CDict* cdict, const ZSTD_CCtx_params* params, U64 pledgedSrcSize) { size_t cutoff = attachDictSizeCutoffs[cdict->matchState.cParams.strategy]; int const dedicatedDictSearch = cdict->matchState.dedicatedDictSearch; return dedicatedDictSearch || ( ( pledgedSrcSize <= cutoff || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN || params->attachDictPref == ZSTD_dictForceAttach ) && params->attachDictPref != ZSTD_dictForceCopy && !params->forceWindow ); /* dictMatchState isn't correctly * handled in _enforceMaxDist */ } static size_t ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, ZSTD_CCtx_params params, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { DEBUGLOG(4, "ZSTD_resetCCtx_byAttachingCDict() pledgedSrcSize=%llu", (unsigned long long)pledgedSrcSize); { ZSTD_compressionParameters adjusted_cdict_cParams = cdict->matchState.cParams; unsigned const windowLog = params.cParams.windowLog; assert(windowLog != 0); /* Resize working context table params for input only, since the dict * has its own tables. */ /* pledgedSrcSize == 0 means 0! */ if (cdict->matchState.dedicatedDictSearch) { ZSTD_dedicatedDictSearch_revertCParams(&adjusted_cdict_cParams); } params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, cdict->dictContentSize, ZSTD_cpm_attachDict, params.useRowMatchFinder); params.cParams.windowLog = windowLog; params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, /* loadedDictSize */ 0, ZSTDcrp_makeClean, zbuff), ""); assert(cctx->appliedParams.cParams.strategy == adjusted_cdict_cParams.strategy); } { const U32 cdictEnd = (U32)( cdict->matchState.window.nextSrc - cdict->matchState.window.base); const U32 cdictLen = cdictEnd - cdict->matchState.window.dictLimit; if (cdictLen == 0) { /* don't even attach dictionaries with no contents */ DEBUGLOG(4, "skipping attaching empty dictionary"); } else { DEBUGLOG(4, "attaching dictionary into context"); cctx->blockState.matchState.dictMatchState = &cdict->matchState; /* prep working match state so dict matches never have negative indices * when they are translated to the working context's index space. */ if (cctx->blockState.matchState.window.dictLimit < cdictEnd) { cctx->blockState.matchState.window.nextSrc = cctx->blockState.matchState.window.base + cdictEnd; ZSTD_window_clear(&cctx->blockState.matchState.window); } /* loadedDictEnd is expressed within the referential of the active context */ cctx->blockState.matchState.loadedDictEnd = cctx->blockState.matchState.window.dictLimit; } } cctx->dictID = cdict->dictID; cctx->dictContentSize = cdict->dictContentSize; /* copy block state */ ZSTD_memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState)); return 0; } static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize, ZSTD_compressionParameters const* cParams) { if (ZSTD_CDictIndicesAreTagged(cParams)){ /* Remove tags from the CDict table if they are present. * See docs on "short cache" in zstd_compress_internal.h for context. */ size_t i; for (i = 0; i < tableSize; i++) { U32 const taggedIndex = src[i]; U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; dst[i] = index; } } else { ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); } } static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, ZSTD_CCtx_params params, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams; assert(!cdict->matchState.dedicatedDictSearch); DEBUGLOG(4, "ZSTD_resetCCtx_byCopyingCDict() pledgedSrcSize=%llu", (unsigned long long)pledgedSrcSize); { unsigned const windowLog = params.cParams.windowLog; assert(windowLog != 0); /* Copy only compression parameters related to tables. */ params.cParams = *cdict_cParams; params.cParams.windowLog = windowLog; params.useRowMatchFinder = cdict->useRowMatchFinder; FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, /* loadedDictSize */ 0, ZSTDcrp_leaveDirty, zbuff), ""); assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy); assert(cctx->appliedParams.cParams.hashLog == cdict_cParams->hashLog); assert(cctx->appliedParams.cParams.chainLog == cdict_cParams->chainLog); } ZSTD_cwksp_mark_tables_dirty(&cctx->workspace); assert(params.useRowMatchFinder != ZSTD_ps_auto); /* copy tables */ { size_t const chainSize = ZSTD_allocateChainTable(cdict_cParams->strategy, cdict->useRowMatchFinder, 0 /* DDS guaranteed disabled */) ? ((size_t)1 << cdict_cParams->chainLog) : 0; size_t const hSize = (size_t)1 << cdict_cParams->hashLog; ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, cdict->matchState.hashTable, hSize, cdict_cParams); /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, cdict->matchState.chainTable, chainSize, cdict_cParams); } /* copy tag table */ if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { size_t const tagTableSize = hSize; ZSTD_memcpy(cctx->blockState.matchState.tagTable, cdict->matchState.tagTable, tagTableSize); cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; } } /* Zero the hashTable3, since the cdict never fills it */ assert(cctx->blockState.matchState.hashLog3 <= 31); { U32 const h3log = cctx->blockState.matchState.hashLog3; size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; assert(cdict->matchState.hashLog3 == 0); ZSTD_memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32)); } ZSTD_cwksp_mark_tables_clean(&cctx->workspace); /* copy dictionary offsets */ { ZSTD_MatchState_t const* srcMatchState = &cdict->matchState; ZSTD_MatchState_t* dstMatchState = &cctx->blockState.matchState; dstMatchState->window = srcMatchState->window; dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; } cctx->dictID = cdict->dictID; cctx->dictContentSize = cdict->dictContentSize; /* copy block state */ ZSTD_memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState)); return 0; } /* We have a choice between copying the dictionary context into the working * context, or referencing the dictionary context from the working context * in-place. We decide here which strategy to use. */ static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, const ZSTD_CCtx_params* params, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { DEBUGLOG(4, "ZSTD_resetCCtx_usingCDict (pledgedSrcSize=%u)", (unsigned)pledgedSrcSize); if (ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) { return ZSTD_resetCCtx_byAttachingCDict( cctx, cdict, *params, pledgedSrcSize, zbuff); } else { return ZSTD_resetCCtx_byCopyingCDict( cctx, cdict, *params, pledgedSrcSize, zbuff); } } /*! ZSTD_copyCCtx_internal() : * Duplicate an existing context `srcCCtx` into another one `dstCCtx`. * Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()). * The "context", in this case, refers to the hash and chain tables, * entropy tables, and dictionary references. * `windowLog` value is enforced if != 0, otherwise value is copied from srcCCtx. * @return : 0, or an error code */ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, ZSTD_frameParameters fParams, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { RETURN_ERROR_IF(srcCCtx->stage!=ZSTDcs_init, stage_wrong, "Can't copy a ctx that's not in init stage."); DEBUGLOG(5, "ZSTD_copyCCtx_internal"); ZSTD_memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem)); { ZSTD_CCtx_params params = dstCCtx->requestedParams; /* Copy only compression parameters related to tables. */ params.cParams = srcCCtx->appliedParams.cParams; assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_ps_auto); assert(srcCCtx->appliedParams.postBlockSplitter != ZSTD_ps_auto); assert(srcCCtx->appliedParams.ldmParams.enableLdm != ZSTD_ps_auto); params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder; params.postBlockSplitter = srcCCtx->appliedParams.postBlockSplitter; params.ldmParams = srcCCtx->appliedParams.ldmParams; params.fParams = fParams; params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, /* loadedDictSize */ 0, ZSTDcrp_leaveDirty, zbuff); assert(dstCCtx->appliedParams.cParams.windowLog == srcCCtx->appliedParams.cParams.windowLog); assert(dstCCtx->appliedParams.cParams.strategy == srcCCtx->appliedParams.cParams.strategy); assert(dstCCtx->appliedParams.cParams.hashLog == srcCCtx->appliedParams.cParams.hashLog); assert(dstCCtx->appliedParams.cParams.chainLog == srcCCtx->appliedParams.cParams.chainLog); assert(dstCCtx->blockState.matchState.hashLog3 == srcCCtx->blockState.matchState.hashLog3); } ZSTD_cwksp_mark_tables_dirty(&dstCCtx->workspace); /* copy tables */ { size_t const chainSize = ZSTD_allocateChainTable(srcCCtx->appliedParams.cParams.strategy, srcCCtx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */) ? ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog) : 0; size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog; U32 const h3log = srcCCtx->blockState.matchState.hashLog3; size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; ZSTD_memcpy(dstCCtx->blockState.matchState.hashTable, srcCCtx->blockState.matchState.hashTable, hSize * sizeof(U32)); ZSTD_memcpy(dstCCtx->blockState.matchState.chainTable, srcCCtx->blockState.matchState.chainTable, chainSize * sizeof(U32)); ZSTD_memcpy(dstCCtx->blockState.matchState.hashTable3, srcCCtx->blockState.matchState.hashTable3, h3Size * sizeof(U32)); } ZSTD_cwksp_mark_tables_clean(&dstCCtx->workspace); /* copy dictionary offsets */ { const ZSTD_MatchState_t* srcMatchState = &srcCCtx->blockState.matchState; ZSTD_MatchState_t* dstMatchState = &dstCCtx->blockState.matchState; dstMatchState->window = srcMatchState->window; dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; } dstCCtx->dictID = srcCCtx->dictID; dstCCtx->dictContentSize = srcCCtx->dictContentSize; /* copy block state */ ZSTD_memcpy(dstCCtx->blockState.prevCBlock, srcCCtx->blockState.prevCBlock, sizeof(*srcCCtx->blockState.prevCBlock)); return 0; } /*! ZSTD_copyCCtx() : * Duplicate an existing context `srcCCtx` into another one `dstCCtx`. * Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()). * pledgedSrcSize==0 means "unknown". * @return : 0, or an error code */ size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long long pledgedSrcSize) { ZSTD_frameParameters fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; ZSTD_buffered_policy_e const zbuff = srcCCtx->bufferedPolicy; ZSTD_STATIC_ASSERT((U32)ZSTDb_buffered==1); if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; fParams.contentSizeFlag = (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN); return ZSTD_copyCCtx_internal(dstCCtx, srcCCtx, fParams, pledgedSrcSize, zbuff); } #define ZSTD_ROWSIZE 16 /*! ZSTD_reduceTable() : * reduce table indexes by `reducerValue`, or squash to zero. * PreserveMark preserves "unsorted mark" for btlazy2 strategy. * It must be set to a clear 0/1 value, to remove branch during inlining. * Presume table size is a multiple of ZSTD_ROWSIZE * to help auto-vectorization */ FORCE_INLINE_TEMPLATE void ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerValue, int const preserveMark) { int const nbRows = (int)size / ZSTD_ROWSIZE; int cellNb = 0; int rowNb; /* Protect special index values < ZSTD_WINDOW_START_INDEX. */ U32 const reducerThreshold = reducerValue + ZSTD_WINDOW_START_INDEX; assert((size & (ZSTD_ROWSIZE-1)) == 0); /* multiple of ZSTD_ROWSIZE */ assert(size < (1U<<31)); /* can be cast to int */ for (rowNb=0 ; rowNb < nbRows ; rowNb++) { int column; for (column=0; column<ZSTD_ROWSIZE; column++) { U32 newVal; if (preserveMark && table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) { /* This write is pointless, but is required(?) for the compiler * to auto-vectorize the loop. */ newVal = ZSTD_DUBT_UNSORTED_MARK; } else if (table[cellNb] < reducerThreshold) { newVal = 0; } else { newVal = table[cellNb] - reducerValue; } table[cellNb] = newVal; cellNb++; } } } static void ZSTD_reduceTable(U32* const table, U32 const size, U32 const reducerValue) { ZSTD_reduceTable_internal(table, size, reducerValue, 0); } static void ZSTD_reduceTable_btlazy2(U32* const table, U32 const size, U32 const reducerValue) { ZSTD_reduceTable_internal(table, size, reducerValue, 1); } /*! ZSTD_reduceIndex() : * rescale all indexes to avoid future overflow (indexes are U32) */ static void ZSTD_reduceIndex (ZSTD_MatchState_t* ms, ZSTD_CCtx_params const* params, const U32 reducerValue) { { U32 const hSize = (U32)1 << params->cParams.hashLog; ZSTD_reduceTable(ms->hashTable, hSize, reducerValue); } if (ZSTD_allocateChainTable(params->cParams.strategy, params->useRowMatchFinder, (U32)ms->dedicatedDictSearch)) { U32 const chainSize = (U32)1 << params->cParams.chainLog; if (params->cParams.strategy == ZSTD_btlazy2) ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue); else ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue); } if (ms->hashLog3) { U32 const h3Size = (U32)1 << ms->hashLog3; ZSTD_reduceTable(ms->hashTable3, h3Size, reducerValue); } } /*-******************************************************* * Block entropic compression *********************************************************/ /* See doc/zstd_compression_format.md for detailed format description */ int ZSTD_seqToCodes(const SeqStore_t* seqStorePtr) { const SeqDef* const sequences = seqStorePtr->sequencesStart; BYTE* const llCodeTable = seqStorePtr->llCode; BYTE* const ofCodeTable = seqStorePtr->ofCode; BYTE* const mlCodeTable = seqStorePtr->mlCode; U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); U32 u; int longOffsets = 0; assert(nbSeq <= seqStorePtr->maxNbSeq); for (u=0; u<nbSeq; u++) { U32 const llv = sequences[u].litLength; U32 const ofCode = ZSTD_highbit32(sequences[u].offBase); U32 const mlv = sequences[u].mlBase; llCodeTable[u] = (BYTE)ZSTD_LLcode(llv); ofCodeTable[u] = (BYTE)ofCode; mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv); assert(!(MEM_64bits() && ofCode >= STREAM_ACCUMULATOR_MIN)); if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) longOffsets = 1; } if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) llCodeTable[seqStorePtr->longLengthPos] = MaxLL; if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) mlCodeTable[seqStorePtr->longLengthPos] = MaxML; return longOffsets; } /* ZSTD_useTargetCBlockSize(): * Returns if target compressed block size param is being used. * If used, compression will do best effort to make a compressed block size to be around targetCBlockSize. * Returns 1 if true, 0 otherwise. */ static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams) { DEBUGLOG(5, "ZSTD_useTargetCBlockSize (targetCBlockSize=%zu)", cctxParams->targetCBlockSize); return (cctxParams->targetCBlockSize != 0); } /* ZSTD_blockSplitterEnabled(): * Returns if block splitting param is being used * If used, compression will do best effort to split a block in order to improve compression ratio. * At the time this function is called, the parameter must be finalized. * Returns 1 if true, 0 otherwise. */ static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams) { DEBUGLOG(5, "ZSTD_blockSplitterEnabled (postBlockSplitter=%d)", cctxParams->postBlockSplitter); assert(cctxParams->postBlockSplitter != ZSTD_ps_auto); return (cctxParams->postBlockSplitter == ZSTD_ps_enable); } /* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types * and size of the sequences statistics */ typedef struct { U32 LLtype; U32 Offtype; U32 MLtype; size_t size; size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ int longOffsets; } ZSTD_symbolEncodingTypeStats_t; /* ZSTD_buildSequencesStatistics(): * Returns a ZSTD_symbolEncodingTypeStats_t, or a zstd error code in the `size` field. * Modifies `nextEntropy` to have the appropriate values as a side effect. * nbSeq must be greater than 0. * * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) */ static ZSTD_symbolEncodingTypeStats_t ZSTD_buildSequencesStatistics( const SeqStore_t* seqStorePtr, size_t nbSeq, const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, BYTE* dst, const BYTE* const dstEnd, ZSTD_strategy strategy, unsigned* countWorkspace, void* entropyWorkspace, size_t entropyWkspSize) { BYTE* const ostart = dst; const BYTE* const oend = dstEnd; BYTE* op = ostart; FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable; FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable; FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable; const BYTE* const ofCodeTable = seqStorePtr->ofCode; const BYTE* const llCodeTable = seqStorePtr->llCode; const BYTE* const mlCodeTable = seqStorePtr->mlCode; ZSTD_symbolEncodingTypeStats_t stats; stats.lastCountSize = 0; /* convert length/distances into codes */ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); assert(op <= oend); assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ /* build CTable for Literal Lengths */ { unsigned max = MaxLL; size_t const mostFrequent = HIST_countFast_wksp(countWorkspace, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ DEBUGLOG(5, "Building LL table"); nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode; stats.LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode, countWorkspace, max, mostFrequent, nbSeq, LLFSELog, prevEntropy->litlengthCTable, LL_defaultNorm, LL_defaultNormLog, ZSTD_defaultAllowed, strategy); assert(set_basic < set_compressed && set_rle < set_compressed); assert(!(stats.LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ { size_t const countSize = ZSTD_buildCTable( op, (size_t)(oend - op), CTable_LitLength, LLFSELog, (SymbolEncodingType_e)stats.LLtype, countWorkspace, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL, prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable), entropyWorkspace, entropyWkspSize); if (ZSTD_isError(countSize)) { DEBUGLOG(3, "ZSTD_buildCTable for LitLens failed"); stats.size = countSize; return stats; } if (stats.LLtype == set_compressed) stats.lastCountSize = countSize; op += countSize; assert(op <= oend); } } /* build CTable for Offsets */ { unsigned max = MaxOff; size_t const mostFrequent = HIST_countFast_wksp( countWorkspace, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ ZSTD_DefaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; DEBUGLOG(5, "Building OF table"); nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; stats.Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, countWorkspace, max, mostFrequent, nbSeq, OffFSELog, prevEntropy->offcodeCTable, OF_defaultNorm, OF_defaultNormLog, defaultPolicy, strategy); assert(!(stats.Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ { size_t const countSize = ZSTD_buildCTable( op, (size_t)(oend - op), CTable_OffsetBits, OffFSELog, (SymbolEncodingType_e)stats.Offtype, countWorkspace, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable), entropyWorkspace, entropyWkspSize); if (ZSTD_isError(countSize)) { DEBUGLOG(3, "ZSTD_buildCTable for Offsets failed"); stats.size = countSize; return stats; } if (stats.Offtype == set_compressed) stats.lastCountSize = countSize; op += countSize; assert(op <= oend); } } /* build CTable for MatchLengths */ { unsigned max = MaxML; size_t const mostFrequent = HIST_countFast_wksp( countWorkspace, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op)); nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode; stats.MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode, countWorkspace, max, mostFrequent, nbSeq, MLFSELog, prevEntropy->matchlengthCTable, ML_defaultNorm, ML_defaultNormLog, ZSTD_defaultAllowed, strategy); assert(!(stats.MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ { size_t const countSize = ZSTD_buildCTable( op, (size_t)(oend - op), CTable_MatchLength, MLFSELog, (SymbolEncodingType_e)stats.MLtype, countWorkspace, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML, prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable), entropyWorkspace, entropyWkspSize); if (ZSTD_isError(countSize)) { DEBUGLOG(3, "ZSTD_buildCTable for MatchLengths failed"); stats.size = countSize; return stats; } if (stats.MLtype == set_compressed) stats.lastCountSize = countSize; op += countSize; assert(op <= oend); } } stats.size = (size_t)(op-ostart); return stats; } /* ZSTD_entropyCompressSeqStore_internal(): * compresses both literals and sequences * Returns compressed size of block, or a zstd error. */ #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 MEM_STATIC size_t ZSTD_entropyCompressSeqStore_internal( void* dst, size_t dstCapacity, const void* literals, size_t litSize, const SeqStore_t* seqStorePtr, const ZSTD_entropyCTables_t* prevEntropy, ZSTD_entropyCTables_t* nextEntropy, const ZSTD_CCtx_params* cctxParams, void* entropyWorkspace, size_t entropyWkspSize, const int bmi2) { ZSTD_strategy const strategy = cctxParams->cParams.strategy; unsigned* count = (unsigned*)entropyWorkspace; FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; const SeqDef* const sequences = seqStorePtr->sequencesStart; const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); const BYTE* const ofCodeTable = seqStorePtr->ofCode; const BYTE* const llCodeTable = seqStorePtr->llCode; const BYTE* const mlCodeTable = seqStorePtr->mlCode; BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstCapacity; BYTE* op = ostart; size_t lastCountSize; int longOffsets = 0; entropyWorkspace = count + (MaxSeq + 1); entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); assert(entropyWkspSize >= HUF_WORKSPACE_SIZE); /* Compress literals */ { size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); /* Base suspicion of uncompressibility on ratio of literals to sequences */ int const suspectUncompressible = (numSequences == 0) || (litSize / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); size_t const cSize = ZSTD_compressLiterals( op, dstCapacity, literals, litSize, entropyWorkspace, entropyWkspSize, &prevEntropy->huf, &nextEntropy->huf, cctxParams->cParams.strategy, ZSTD_literalsCompressionIsDisabled(cctxParams), suspectUncompressible, bmi2); FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); assert(cSize <= dstCapacity); op += cSize; } /* Sequences Header */ RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, dstSize_tooSmall, "Can't fit seq hdr in output buf!"); if (nbSeq < 128) { *op++ = (BYTE)nbSeq; } else if (nbSeq < LONGNBSEQ) { op[0] = (BYTE)((nbSeq>>8) + 0x80); op[1] = (BYTE)nbSeq; op+=2; } else { op[0]=0xFF; MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)); op+=3; } assert(op <= oend); if (nbSeq==0) { /* Copy the old tables over as if we repeated them */ ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); return (size_t)(op - ostart); } { BYTE* const seqHead = op++; /* build stats for sequences */ const ZSTD_symbolEncodingTypeStats_t stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, &prevEntropy->fse, &nextEntropy->fse, op, oend, strategy, count, entropyWorkspace, entropyWkspSize); FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); lastCountSize = stats.lastCountSize; op += stats.size; longOffsets = stats.longOffsets; } { size_t const bitstreamSize = ZSTD_encodeSequences( op, (size_t)(oend - op), CTable_MatchLength, mlCodeTable, CTable_OffsetBits, ofCodeTable, CTable_LitLength, llCodeTable, sequences, nbSeq, longOffsets, bmi2); FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed"); op += bitstreamSize; assert(op <= oend); /* zstd versions <= 1.3.4 mistakenly report corruption when * FSE_readNCount() receives a buffer < 4 bytes. * Fixed by https://github.com/facebook/zstd/pull/1146. * This can happen when the last set_compressed table present is 2 * bytes and the bitstream is only one byte. * In this exceedingly rare case, we will simply emit an uncompressed * block, since it isn't worth optimizing. */ if (lastCountSize && (lastCountSize + bitstreamSize) < 4) { /* lastCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ assert(lastCountSize + bitstreamSize == 3); DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by " "emitting an uncompressed block."); return 0; } } DEBUGLOG(5, "compressed block size : %u", (unsigned)(op - ostart)); return (size_t)(op - ostart); } static size_t ZSTD_entropyCompressSeqStore_wExtLitBuffer( void* dst, size_t dstCapacity, const void* literals, size_t litSize, size_t blockSize, const SeqStore_t* seqStorePtr, const ZSTD_entropyCTables_t* prevEntropy, ZSTD_entropyCTables_t* nextEntropy, const ZSTD_CCtx_params* cctxParams, void* entropyWorkspace, size_t entropyWkspSize, int bmi2) { size_t const cSize = ZSTD_entropyCompressSeqStore_internal( dst, dstCapacity, literals, litSize, seqStorePtr, prevEntropy, nextEntropy, cctxParams, entropyWorkspace, entropyWkspSize, bmi2); if (cSize == 0) return 0; /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. */ if ((cSize == ERROR(dstSize_tooSmall)) & (blockSize <= dstCapacity)) { DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); return 0; /* block not compressed */ } FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); /* Check compressibility */ { size_t const maxCSize = blockSize - ZSTD_minGain(blockSize, cctxParams->cParams.strategy); if (cSize >= maxCSize) return 0; /* block not compressed */ } DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. */ assert(cSize < ZSTD_BLOCKSIZE_MAX); return cSize; } static size_t ZSTD_entropyCompressSeqStore( const SeqStore_t* seqStorePtr, const ZSTD_entropyCTables_t* prevEntropy, ZSTD_entropyCTables_t* nextEntropy, const ZSTD_CCtx_params* cctxParams, void* dst, size_t dstCapacity, size_t srcSize, void* entropyWorkspace, size_t entropyWkspSize, int bmi2) { return ZSTD_entropyCompressSeqStore_wExtLitBuffer( dst, dstCapacity, seqStorePtr->litStart, (size_t)(seqStorePtr->lit - seqStorePtr->litStart), srcSize, seqStorePtr, prevEntropy, nextEntropy, cctxParams, entropyWorkspace, entropyWkspSize, bmi2); } /* ZSTD_selectBlockCompressor() : * Not static, but internal use only (used by long distance matcher) * assumption : strat is a valid strategy */ ZSTD_BlockCompressor_f ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_ParamSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) { static const ZSTD_BlockCompressor_f blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { { ZSTD_compressBlock_fast /* default for 0 */, ZSTD_compressBlock_fast, ZSTD_COMPRESSBLOCK_DOUBLEFAST, ZSTD_COMPRESSBLOCK_GREEDY, ZSTD_COMPRESSBLOCK_LAZY, ZSTD_COMPRESSBLOCK_LAZY2, ZSTD_COMPRESSBLOCK_BTLAZY2, ZSTD_COMPRESSBLOCK_BTOPT, ZSTD_COMPRESSBLOCK_BTULTRA, ZSTD_COMPRESSBLOCK_BTULTRA2 }, { ZSTD_compressBlock_fast_extDict /* default for 0 */, ZSTD_compressBlock_fast_extDict, ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT, ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT, ZSTD_COMPRESSBLOCK_LAZY_EXTDICT, ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT, ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT, ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT, ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT, ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT }, { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, ZSTD_compressBlock_fast_dictMatchState, ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE, ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE, ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE, ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE, ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE, ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE, ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE, ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE }, { NULL /* default for 0 */, NULL, NULL, ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH, ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH, ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH, NULL, NULL, NULL, NULL } }; ZSTD_BlockCompressor_f selectedCompressor; ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1); assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); DEBUGLOG(5, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { static const ZSTD_BlockCompressor_f rowBasedBlockCompressors[4][3] = { { ZSTD_COMPRESSBLOCK_GREEDY_ROW, ZSTD_COMPRESSBLOCK_LAZY_ROW, ZSTD_COMPRESSBLOCK_LAZY2_ROW }, { ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW, ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW, ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW }, { ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW, ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW, ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW }, { ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW, ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW, ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW } }; DEBUGLOG(5, "Selecting a row-based matchfinder"); assert(useRowMatchFinder != ZSTD_ps_auto); selectedCompressor = rowBasedBlockCompressors[(int)dictMode][(int)strat - (int)ZSTD_greedy]; } else { selectedCompressor = blockCompressor[(int)dictMode][(int)strat]; } assert(selectedCompressor != NULL); return selectedCompressor; } static void ZSTD_storeLastLiterals(SeqStore_t* seqStorePtr, const BYTE* anchor, size_t lastLLSize) { ZSTD_memcpy(seqStorePtr->lit, anchor, lastLLSize); seqStorePtr->lit += lastLLSize; } void ZSTD_resetSeqStore(SeqStore_t* ssPtr) { ssPtr->lit = ssPtr->litStart; ssPtr->sequences = ssPtr->sequencesStart; ssPtr->longLengthType = ZSTD_llt_none; } /* ZSTD_postProcessSequenceProducerResult() : * Validates and post-processes sequences obtained through the external matchfinder API: * - Checks whether nbExternalSeqs represents an error condition. * - Appends a block delimiter to outSeqs if one is not already present. * See zstd.h for context regarding block delimiters. * Returns the number of sequences after post-processing, or an error code. */ static size_t ZSTD_postProcessSequenceProducerResult( ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize ) { RETURN_ERROR_IF( nbExternalSeqs > outSeqsCapacity, sequenceProducer_failed, "External sequence producer returned error code %lu", (unsigned long)nbExternalSeqs ); RETURN_ERROR_IF( nbExternalSeqs == 0 && srcSize > 0, sequenceProducer_failed, "Got zero sequences from external sequence producer for a non-empty src buffer!" ); if (srcSize == 0) { ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); return 1; } { ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; /* We can return early if lastSeq is already a block delimiter. */ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { return nbExternalSeqs; } /* This error condition is only possible if the external matchfinder * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ RETURN_ERROR_IF( nbExternalSeqs == outSeqsCapacity, sequenceProducer_failed, "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" ); /* lastSeq is not a block delimiter, so we need to append one. */ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); return nbExternalSeqs + 1; } } /* ZSTD_fastSequenceLengthSum() : * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. * Similar to another function in zstd_compress.c (determine_blockSize), * except it doesn't check for a block delimiter to end summation. * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. */ static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { size_t matchLenSum, litLenSum, i; matchLenSum = 0; litLenSum = 0; for (i = 0; i < seqBufSize; i++) { litLenSum += seqBuf[i].litLength; matchLenSum += seqBuf[i].matchLength; } return litLenSum + matchLenSum; } /* * Function to validate sequences produced by a block compressor. */ static void ZSTD_validateSeqStore(const SeqStore_t* seqStore, const ZSTD_compressionParameters* cParams) { #if DEBUGLEVEL >= 1 const SeqDef* seq = seqStore->sequencesStart; const SeqDef* const seqEnd = seqStore->sequences; size_t const matchLenLowerBound = cParams->minMatch == 3 ? 3 : 4; for (; seq < seqEnd; ++seq) { const ZSTD_SequenceLength seqLength = ZSTD_getSequenceLength(seqStore, seq); assert(seqLength.matchLength >= matchLenLowerBound); (void)seqLength; (void)matchLenLowerBound; } #else (void)seqStore; (void)cParams; #endif } static size_t ZSTD_transferSequences_wBlockDelim(ZSTD_CCtx* cctx, ZSTD_SequencePosition* seqPos, const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, const void* src, size_t blockSize, ZSTD_ParamSwitch_e externalRepSearch); typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_BuildSeqStore_e; static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) { ZSTD_MatchState_t* const ms = &zc->blockState.matchState; DEBUGLOG(5, "ZSTD_buildSeqStore (srcSize=%zu)", srcSize); assert(srcSize <= ZSTD_BLOCKSIZE_MAX); /* Assert that we have correctly flushed the ctx params into the ms's copy */ ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding * additional 1. We need to revisit and change this logic to be more consistent */ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); } else { ZSTD_ldm_skipSequences(&zc->externSeqStore, srcSize, zc->appliedParams.cParams.minMatch); } return ZSTDbss_noCompress; /* don't even attempt compression below a certain srcSize */ } ZSTD_resetSeqStore(&(zc->seqStore)); /* required for optimal parser to read stats from dictionary */ ms->opt.symbolCosts = &zc->blockState.prevCBlock->entropy; /* tell the optimal parser how we expect to compress literals */ ms->opt.literalCompressionMode = zc->appliedParams.literalCompressionMode; /* a gap between an attached dict and the current window is not safe, * they must remain adjacent, * and when that stops being the case, the dict must be unset */ assert(ms->dictMatchState == NULL || ms->loadedDictEnd == ms->window.dictLimit); /* limited update after a very long match */ { const BYTE* const base = ms->window.base; const BYTE* const istart = (const BYTE*)src; const U32 curr = (U32)(istart-base); if (sizeof(ptrdiff_t)==8) assert(istart - base < (ptrdiff_t)(U32)(-1)); /* ensure no overflow */ if (curr > ms->nextToUpdate + 384) ms->nextToUpdate = curr - MIN(192, (U32)(curr - ms->nextToUpdate - 384)); } /* select and store sequences */ { ZSTD_dictMode_e const dictMode = ZSTD_matchState_dictMode(ms); size_t lastLLSize; { int i; for (i = 0; i < ZSTD_REP_NUM; ++i) zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i]; } if (zc->externSeqStore.pos < zc->externSeqStore.size) { assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); /* External matchfinder + LDM is technically possible, just not implemented yet. * We need to revisit soon and implement it. */ RETURN_ERROR_IF( ZSTD_hasExtSeqProd(&zc->appliedParams), parameter_combination_unsupported, "Long-distance matching with external sequence producer enabled is not currently supported." ); /* Updates ldmSeqStore.pos */ lastLLSize = ZSTD_ldm_blockCompress(&zc->externSeqStore, ms, &zc->seqStore, zc->blockState.nextCBlock->rep, zc->appliedParams.useRowMatchFinder, src, srcSize); assert(zc->externSeqStore.pos <= zc->externSeqStore.size); } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { RawSeqStore_t ldmSeqStore = kNullRawSeqStore; /* External matchfinder + LDM is technically possible, just not implemented yet. * We need to revisit soon and implement it. */ RETURN_ERROR_IF( ZSTD_hasExtSeqProd(&zc->appliedParams), parameter_combination_unsupported, "Long-distance matching with external sequence producer enabled is not currently supported." ); ldmSeqStore.seq = zc->ldmSequences; ldmSeqStore.capacity = zc->maxNbLdmSequences; /* Updates ldmSeqStore.size */ FORWARD_IF_ERROR(ZSTD_ldm_generateSequences(&zc->ldmState, &ldmSeqStore, &zc->appliedParams.ldmParams, src, srcSize), ""); /* Updates ldmSeqStore.pos */ lastLLSize = ZSTD_ldm_blockCompress(&ldmSeqStore, ms, &zc->seqStore, zc->blockState.nextCBlock->rep, zc->appliedParams.useRowMatchFinder, src, srcSize); assert(ldmSeqStore.pos == ldmSeqStore.size); } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) { assert( zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize) ); assert(zc->appliedParams.extSeqProdFunc != NULL); { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)( zc->appliedParams.extSeqProdState, zc->extSeqBuf, zc->extSeqBufCapacity, src, srcSize, NULL, 0, /* dict and dictSize, currently not supported */ zc->appliedParams.compressionLevel, windowSize ); size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( zc->extSeqBuf, nbExternalSeqs, zc->extSeqBufCapacity, srcSize ); /* Return early if there is no error, since we don't need to worry about last literals */ if (!ZSTD_isError(nbPostProcessedSeqs)) { ZSTD_SequencePosition seqPos = {0,0,0}; size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs); RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); FORWARD_IF_ERROR( ZSTD_transferSequences_wBlockDelim( zc, &seqPos, zc->extSeqBuf, nbPostProcessedSeqs, src, srcSize, zc->appliedParams.searchForExternalRepcodes ), "Failed to copy external sequences to seqStore!" ); ms->ldmSeqStore = NULL; DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); return ZSTDbss_compress; } /* Propagate the error if fallback is disabled */ if (!zc->appliedParams.enableMatchFinderFallback) { return nbPostProcessedSeqs; } /* Fallback to software matchfinder */ { ZSTD_BlockCompressor_f const blockCompressor = ZSTD_selectBlockCompressor( zc->appliedParams.cParams.strategy, zc->appliedParams.useRowMatchFinder, dictMode); ms->ldmSeqStore = NULL; DEBUGLOG( 5, "External sequence producer returned error code %lu. Falling back to internal parser.", (unsigned long)nbExternalSeqs ); lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); } } } else { /* not long range mode and no external matchfinder */ ZSTD_BlockCompressor_f const blockCompressor = ZSTD_selectBlockCompressor( zc->appliedParams.cParams.strategy, zc->appliedParams.useRowMatchFinder, dictMode); ms->ldmSeqStore = NULL; lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); } { const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize; ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize); } } ZSTD_validateSeqStore(&zc->seqStore, &zc->appliedParams.cParams); return ZSTDbss_compress; } static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const SeqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM]) { const SeqDef* inSeqs = seqStore->sequencesStart; const size_t nbInSequences = (size_t)(seqStore->sequences - inSeqs); const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart); ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex; const size_t nbOutSequences = nbInSequences + 1; size_t nbOutLiterals = 0; Repcodes_t repcodes; size_t i; /* Bounds check that we have enough space for every input sequence * and the block delimiter */ assert(seqCollector->seqIndex <= seqCollector->maxSequences); RETURN_ERROR_IF( nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex), dstSize_tooSmall, "Not enough space to copy sequences"); ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes)); for (i = 0; i < nbInSequences; ++i) { U32 rawOffset; outSeqs[i].litLength = inSeqs[i].litLength; outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH; outSeqs[i].rep = 0; /* Handle the possible single length >= 64K * There can only be one because we add MINMATCH to every match length, * and blocks are at most 128K. */ if (i == seqStore->longLengthPos) { if (seqStore->longLengthType == ZSTD_llt_literalLength) { outSeqs[i].litLength += 0x10000; } else if (seqStore->longLengthType == ZSTD_llt_matchLength) { outSeqs[i].matchLength += 0x10000; } } /* Determine the raw offset given the offBase, which may be a repcode. */ if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) { const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase); assert(repcode > 0); outSeqs[i].rep = repcode; if (outSeqs[i].litLength != 0) { rawOffset = repcodes.rep[repcode - 1]; } else { if (repcode == 3) { assert(repcodes.rep[0] > 1); rawOffset = repcodes.rep[0] - 1; } else { rawOffset = repcodes.rep[repcode]; } } } else { rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase); } outSeqs[i].offset = rawOffset; /* Update repcode history for the sequence */ ZSTD_updateRep(repcodes.rep, inSeqs[i].offBase, inSeqs[i].litLength == 0); nbOutLiterals += outSeqs[i].litLength; } /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker * for the block boundary, according to the API. */ assert(nbInLiterals >= nbOutLiterals); { const size_t lastLLSize = nbInLiterals - nbOutLiterals; outSeqs[nbInSequences].litLength = (U32)lastLLSize; outSeqs[nbInSequences].matchLength = 0; outSeqs[nbInSequences].offset = 0; assert(nbOutSequences == nbInSequences + 1); } seqCollector->seqIndex += nbOutSequences; assert(seqCollector->seqIndex <= seqCollector->maxSequences); return 0; } size_t ZSTD_sequenceBound(size_t srcSize) { const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1; const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1; return maxNbSeq + maxNbDelims; } size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, size_t outSeqsSize, const void* src, size_t srcSize) { const size_t dstCapacity = ZSTD_compressBound(srcSize); void* dst; /* Make C90 happy. */ SeqCollector seqCollector; { int targetCBlockSize; FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), ""); RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0"); } { int nbWorkers; FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), ""); RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0"); } dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); seqCollector.collectSequences = 1; seqCollector.seqStart = outSeqs; seqCollector.seqIndex = 0; seqCollector.maxSequences = outSeqsSize; zc->seqCollector = seqCollector; { const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); ZSTD_customFree(dst, ZSTD_defaultCMem); FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed"); } assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize)); return zc->seqCollector.seqIndex; } size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize) { size_t in = 0; size_t out = 0; for (; in < seqsSize; ++in) { if (sequences[in].offset == 0 && sequences[in].matchLength == 0) { if (in != seqsSize - 1) { sequences[in+1].litLength += sequences[in].litLength; } } else { sequences[out] = sequences[in]; ++out; } } return out; } /* Unrolled loop to read four size_ts of input at a time. Returns 1 if is RLE, 0 if not. */ static int ZSTD_isRLE(const BYTE* src, size_t length) { const BYTE* ip = src; const BYTE value = ip[0]; const size_t valueST = (size_t)((U64)value * 0x0101010101010101ULL); const size_t unrollSize = sizeof(size_t) * 4; const size_t unrollMask = unrollSize - 1; const size_t prefixLength = length & unrollMask; size_t i; if (length == 1) return 1; /* Check if prefix is RLE first before using unrolled loop */ if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { return 0; } for (i = prefixLength; i != length; i += unrollSize) { size_t u; for (u = 0; u < unrollSize; u += sizeof(size_t)) { if (MEM_readST(ip + i + u) != valueST) { return 0; } } } return 1; } /* Returns true if the given block may be RLE. * This is just a heuristic based on the compressibility. * It may return both false positives and false negatives. */ static int ZSTD_maybeRLE(SeqStore_t const* seqStore) { size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); size_t const nbLits = (size_t)(seqStore->lit - seqStore->litStart); return nbSeqs < 4 && nbLits < 10; } static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) { ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; bs->prevCBlock = bs->nextCBlock; bs->nextCBlock = tmp; } /* Writes the block header */ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { U32 const cBlockHeader = cSize == 1 ? lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); MEM_writeLE24(op, cBlockHeader); DEBUGLOG(5, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); } /* ZSTD_buildBlockEntropyStats_literals() : * Builds entropy for the literals. * Stores literals block type (raw, rle, compressed, repeat) and * huffman description table to hufMetadata. * Requires ENTROPY_WORKSPACE_SIZE workspace * @return : size of huffman description table, or an error code */ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, const ZSTD_hufCTables_t* prevHuf, ZSTD_hufCTables_t* nextHuf, ZSTD_hufCTablesMetadata_t* hufMetadata, const int literalsCompressionIsDisabled, void* workspace, size_t wkspSize, int hufFlags) { BYTE* const wkspStart = (BYTE*)workspace; BYTE* const wkspEnd = wkspStart + wkspSize; BYTE* const countWkspStart = wkspStart; unsigned* const countWksp = (unsigned*)workspace; const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); BYTE* const nodeWksp = countWkspStart + countWkspSize; const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; unsigned huffLog = LitHufLog; HUF_repeat repeat = prevHuf->repeatMode; DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); /* Prepare nextEntropy assuming reusing the existing table */ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); if (literalsCompressionIsDisabled) { DEBUGLOG(5, "set_basic - disabled"); hufMetadata->hType = set_basic; return 0; } /* small ? don't even attempt compression (speed opt) */ #ifndef COMPRESS_LITERALS_SIZE_MIN # define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ #endif { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; if (srcSize <= minLitSize) { DEBUGLOG(5, "set_basic - too small"); hufMetadata->hType = set_basic; return 0; } } /* Scan input and build symbol stats */ { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); if (largest == srcSize) { /* only one literal symbol */ DEBUGLOG(5, "set_rle"); hufMetadata->hType = set_rle; return 0; } if (largest <= (srcSize >> 7)+4) { /* heuristic: likely not compressible */ DEBUGLOG(5, "set_basic - no gain"); hufMetadata->hType = set_basic; return 0; } } /* Validate the previous Huffman table */ if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { repeat = HUF_repeat_none; } /* Build Huffman Tree */ ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); assert(huffLog <= LitHufLog); { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue, huffLog, nodeWksp, nodeWkspSize); FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); huffLog = (U32)maxBits; } { /* Build and write the CTable */ size_t const newCSize = HUF_estimateCompressedSize( (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); size_t const hSize = HUF_writeCTable_wksp( hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, nodeWksp, nodeWkspSize); /* Check against repeating the previous CTable */ if (repeat != HUF_repeat_none) { size_t const oldCSize = HUF_estimateCompressedSize( (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { DEBUGLOG(5, "set_repeat - smaller"); ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); hufMetadata->hType = set_repeat; return 0; } } if (newCSize + hSize >= srcSize) { DEBUGLOG(5, "set_basic - no gains"); ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); hufMetadata->hType = set_basic; return 0; } DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); hufMetadata->hType = set_compressed; nextHuf->repeatMode = HUF_repeat_check; return hSize; } } /* ZSTD_buildDummySequencesStatistics(): * Returns a ZSTD_symbolEncodingTypeStats_t with all encoding types as set_basic, * and updates nextEntropy to the appropriate repeatMode. */ static ZSTD_symbolEncodingTypeStats_t ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; nextEntropy->litlength_repeatMode = FSE_repeat_none; nextEntropy->offcode_repeatMode = FSE_repeat_none; nextEntropy->matchlength_repeatMode = FSE_repeat_none; return stats; } /* ZSTD_buildBlockEntropyStats_sequences() : * Builds entropy for the sequences. * Stores symbol compression modes and fse table to fseMetadata. * Requires ENTROPY_WORKSPACE_SIZE wksp. * @return : size of fse tables or error code */ static size_t ZSTD_buildBlockEntropyStats_sequences( const SeqStore_t* seqStorePtr, const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, const ZSTD_CCtx_params* cctxParams, ZSTD_fseCTablesMetadata_t* fseMetadata, void* workspace, size_t wkspSize) { ZSTD_strategy const strategy = cctxParams->cParams.strategy; size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); BYTE* const ostart = fseMetadata->fseTablesBuffer; BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); BYTE* op = ostart; unsigned* countWorkspace = (unsigned*)workspace; unsigned* entropyWorkspace = countWorkspace + (MaxSeq + 1); size_t entropyWorkspaceSize = wkspSize - (MaxSeq + 1) * sizeof(*countWorkspace); ZSTD_symbolEncodingTypeStats_t stats; DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_sequences (nbSeq=%zu)", nbSeq); stats = nbSeq != 0 ? ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, prevEntropy, nextEntropy, op, oend, strategy, countWorkspace, entropyWorkspace, entropyWorkspaceSize) : ZSTD_buildDummySequencesStatistics(nextEntropy); FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); fseMetadata->llType = (SymbolEncodingType_e) stats.LLtype; fseMetadata->ofType = (SymbolEncodingType_e) stats.Offtype; fseMetadata->mlType = (SymbolEncodingType_e) stats.MLtype; fseMetadata->lastCountSize = stats.lastCountSize; return stats.size; } /* ZSTD_buildBlockEntropyStats() : * Builds entropy for the block. * Requires workspace size ENTROPY_WORKSPACE_SIZE * @return : 0 on success, or an error code * Note : also employed in superblock */ size_t ZSTD_buildBlockEntropyStats( const SeqStore_t* seqStorePtr, const ZSTD_entropyCTables_t* prevEntropy, ZSTD_entropyCTables_t* nextEntropy, const ZSTD_CCtx_params* cctxParams, ZSTD_entropyCTablesMetadata_t* entropyMetadata, void* workspace, size_t wkspSize) { size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0; entropyMetadata->hufMetadata.hufDesSize = ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, &prevEntropy->huf, &nextEntropy->huf, &entropyMetadata->hufMetadata, ZSTD_literalsCompressionIsDisabled(cctxParams), workspace, wkspSize, hufFlags); FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); entropyMetadata->fseMetadata.fseTablesSize = ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, &prevEntropy->fse, &nextEntropy->fse, cctxParams, &entropyMetadata->fseMetadata, workspace, wkspSize); FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildBlockEntropyStats_sequences failed"); return 0; } /* Returns the size estimate for the literals section (header + content) of a block */ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, const ZSTD_hufCTables_t* huf, const ZSTD_hufCTablesMetadata_t* hufMetadata, void* workspace, size_t wkspSize, int writeEntropy) { unsigned* const countWksp = (unsigned*)workspace; unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; size_t literalSectionHeaderSize = 3 + (litSize >= 1 KB) + (litSize >= 16 KB); U32 singleStream = litSize < 256; if (hufMetadata->hType == set_basic) return litSize; else if (hufMetadata->hType == set_rle) return 1; else if (hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize); if (ZSTD_isError(largest)) return litSize; { size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue); if (writeEntropy) cLitSizeEstimate += hufMetadata->hufDesSize; if (!singleStream) cLitSizeEstimate += 6; /* multi-stream huffman uses 6-byte jump table */ return cLitSizeEstimate + literalSectionHeaderSize; } } assert(0); /* impossible */ return 0; } /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ static size_t ZSTD_estimateBlockSize_symbolType(SymbolEncodingType_e type, const BYTE* codeTable, size_t nbSeq, unsigned maxCode, const FSE_CTable* fseCTable, const U8* additionalBits, short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, void* workspace, size_t wkspSize) { unsigned* const countWksp = (unsigned*)workspace; const BYTE* ctp = codeTable; const BYTE* const ctStart = ctp; const BYTE* const ctEnd = ctStart + nbSeq; size_t cSymbolTypeSizeEstimateInBits = 0; unsigned max = maxCode; HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize); /* can't fail */ if (type == set_basic) { /* We selected this encoding type, so it must be valid. */ assert(max <= defaultMax); (void)defaultMax; cSymbolTypeSizeEstimateInBits = ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max); } else if (type == set_rle) { cSymbolTypeSizeEstimateInBits = 0; } else if (type == set_compressed || type == set_repeat) { cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max); } if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) { return nbSeq * 10; } while (ctp < ctEnd) { if (additionalBits) cSymbolTypeSizeEstimateInBits += additionalBits[*ctp]; else cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */ ctp++; } return cSymbolTypeSizeEstimateInBits >> 3; } /* Returns the size estimate for the sequences section (header + content) of a block */ static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, const BYTE* llCodeTable, const BYTE* mlCodeTable, size_t nbSeq, const ZSTD_fseCTables_t* fseTables, const ZSTD_fseCTablesMetadata_t* fseMetadata, void* workspace, size_t wkspSize, int writeEntropy) { size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); size_t cSeqSizeEstimate = 0; cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, fseTables->offcodeCTable, NULL, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, workspace, wkspSize); cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, fseTables->litlengthCTable, LL_bits, LL_defaultNorm, LL_defaultNormLog, MaxLL, workspace, wkspSize); cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, fseTables->matchlengthCTable, ML_bits, ML_defaultNorm, ML_defaultNormLog, MaxML, workspace, wkspSize); if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; return cSeqSizeEstimate + sequencesSectionHeaderSize; } /* Returns the size estimate for a given stream of literals, of, ll, ml */ static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, const BYTE* ofCodeTable, const BYTE* llCodeTable, const BYTE* mlCodeTable, size_t nbSeq, const ZSTD_entropyCTables_t* entropy, const ZSTD_entropyCTablesMetadata_t* entropyMetadata, void* workspace, size_t wkspSize, int writeLitEntropy, int writeSeqEntropy) { size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, &entropy->huf, &entropyMetadata->hufMetadata, workspace, wkspSize, writeLitEntropy); size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, workspace, wkspSize, writeSeqEntropy); return seqSize + literalsSize + ZSTD_blockHeaderSize; } /* Builds entropy statistics and uses them for blocksize estimation. * * @return: estimated compressed size of the seqStore, or a zstd error. */ static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(SeqStore_t* seqStore, ZSTD_CCtx* zc) { ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, &zc->appliedParams, entropyMetadata, zc->tmpWorkspace, zc->tmpWkspSize), ""); return ZSTD_estimateBlockSize( seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), seqStore->ofCode, seqStore->llCode, seqStore->mlCode, (size_t)(seqStore->sequences - seqStore->sequencesStart), &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->tmpWorkspace, zc->tmpWkspSize, (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); } /* Returns literals bytes represented in a seqStore */ static size_t ZSTD_countSeqStoreLiteralsBytes(const SeqStore_t* const seqStore) { size_t literalsBytes = 0; size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); size_t i; for (i = 0; i < nbSeqs; ++i) { SeqDef const seq = seqStore->sequencesStart[i]; literalsBytes += seq.litLength; if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { literalsBytes += 0x10000; } } return literalsBytes; } /* Returns match bytes represented in a seqStore */ static size_t ZSTD_countSeqStoreMatchBytes(const SeqStore_t* const seqStore) { size_t matchBytes = 0; size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); size_t i; for (i = 0; i < nbSeqs; ++i) { SeqDef seq = seqStore->sequencesStart[i]; matchBytes += seq.mlBase + MINMATCH; if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { matchBytes += 0x10000; } } return matchBytes; } /* Derives the seqStore that is a chunk of the originalSeqStore from [startIdx, endIdx). * Stores the result in resultSeqStore. */ static void ZSTD_deriveSeqStoreChunk(SeqStore_t* resultSeqStore, const SeqStore_t* originalSeqStore, size_t startIdx, size_t endIdx) { *resultSeqStore = *originalSeqStore; if (startIdx > 0) { resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); } /* Move longLengthPos into the correct position if necessary */ if (originalSeqStore->longLengthType != ZSTD_llt_none) { if (originalSeqStore->longLengthPos < startIdx || originalSeqStore->longLengthPos > endIdx) { resultSeqStore->longLengthType = ZSTD_llt_none; } else { resultSeqStore->longLengthPos -= (U32)startIdx; } } resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { /* This accounts for possible last literals if the derived chunk reaches the end of the block */ assert(resultSeqStore->lit == originalSeqStore->lit); } else { size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; } resultSeqStore->llCode += startIdx; resultSeqStore->mlCode += startIdx; resultSeqStore->ofCode += startIdx; } /* * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). */ static U32 ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) { U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ assert(OFFBASE_IS_REPCODE(offBase)); if (adjustedRepCode == ZSTD_REP_NUM) { assert(ll0); /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 * This is only valid if it results in a valid offset value, aka > 0. * Note : it may happen that `rep[0]==1` in exceptional circumstances. * In which case this function will return 0, which is an invalid offset. * It's not an issue though, since this value will be * compared and discarded within ZSTD_seqStore_resolveOffCodes(). */ return rep[0] - 1; } return rep[adjustedRepCode]; } /* * ZSTD_seqStore_resolveOffCodes() reconciles any possible divergences in offset history that may arise * due to emission of RLE/raw blocks that disturb the offset history, * and replaces any repcodes within the seqStore that may be invalid. * * dRepcodes are updated as would be on the decompression side. * cRepcodes are updated exactly in accordance with the seqStore. * * Note : this function assumes seq->offBase respects the following numbering scheme : * 0 : invalid * 1-3 : repcode 1-3 * 4+ : real_offset+3 */ static void ZSTD_seqStore_resolveOffCodes(Repcodes_t* const dRepcodes, Repcodes_t* const cRepcodes, const SeqStore_t* const seqStore, U32 const nbSeq) { U32 idx = 0; U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? seqStore->longLengthPos : nbSeq; for (; idx < nbSeq; ++idx) { SeqDef* const seq = seqStore->sequencesStart + idx; U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); U32 const offBase = seq->offBase; assert(offBase > 0); if (OFFBASE_IS_REPCODE(offBase)) { U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); /* Adjust simulated decompression repcode history if we come across a mismatch. Replace * the repcode with the offset it actually references, determined by the compression * repcode history. */ if (dRawOffset != cRawOffset) { seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); } } /* Compression repcode history is always updated with values directly from the unmodified seqStore. * Decompression repcode history may use modified seq->offset value taken from compression repcode history. */ ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); ZSTD_updateRep(cRepcodes->rep, offBase, ll0); } } /* ZSTD_compressSeqStore_singleBlock(): * Compresses a seqStore into a block with a block header, into the buffer dst. * * Returns the total size of that block (including header) or a ZSTD error code. */ static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, const SeqStore_t* const seqStore, Repcodes_t* const dRep, Repcodes_t* const cRep, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock, U32 isPartition) { const U32 rleMaxLength = 25; BYTE* op = (BYTE*)dst; const BYTE* ip = (const BYTE*)src; size_t cSize; size_t cSeqsSize; /* In case of an RLE or raw block, the simulated decompression repcode history must be reset */ Repcodes_t const dRepOriginal = *dRep; DEBUGLOG(5, "ZSTD_compressSeqStore_singleBlock"); if (isPartition) ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart)); RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "Block header doesn't fit"); cSeqsSize = ZSTD_entropyCompressSeqStore(seqStore, &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, &zc->appliedParams, op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize, srcSize, zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */, zc->bmi2); FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!"); if (!zc->isFirstBlock && cSeqsSize < rleMaxLength && ZSTD_isRLE((BYTE const*)src, srcSize)) { /* We don't want to emit our first block as a RLE even if it qualifies because * doing so will cause the decoder (cli only) to throw a "should consume all input error." * This is only an issue for zstd <= v1.4.3 */ cSeqsSize = 1; } /* Sequence collection not supported when block splitting */ if (zc->seqCollector.collectSequences) { FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed"); ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); return 0; } if (cSeqsSize == 0) { cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); FORWARD_IF_ERROR(cSize, "Nocompress block failed"); DEBUGLOG(5, "Writing out nocompress block, size: %zu", cSize); *dRep = dRepOriginal; /* reset simulated decompression repcode history */ } else if (cSeqsSize == 1) { cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock); FORWARD_IF_ERROR(cSize, "RLE compress block failed"); DEBUGLOG(5, "Writing out RLE block, size: %zu", cSize); *dRep = dRepOriginal; /* reset simulated decompression repcode history */ } else { ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); writeBlockHeader(op, cSeqsSize, srcSize, lastBlock); cSize = ZSTD_blockHeaderSize + cSeqsSize; DEBUGLOG(5, "Writing out compressed block, size: %zu", cSize); } if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; return cSize; } /* Struct to keep track of where we are in our recursive calls. */ typedef struct { U32* splitLocations; /* Array of split indices */ size_t idx; /* The current index within splitLocations being worked on */ } seqStoreSplits; #define MIN_SEQUENCES_BLOCK_SPLITTING 300 /* Helper function to perform the recursive search for block splits. * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. * If advantageous to split, then we recurse down the two sub-blocks. * If not, or if an error occurred in estimation, then we do not recurse. * * Note: The recursion depth is capped by a heuristic minimum number of sequences, * defined by MIN_SEQUENCES_BLOCK_SPLITTING. * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). * In practice, recursion depth usually doesn't go beyond 4. * * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize * maximum of 128 KB, this value is actually impossible to reach. */ static void ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, ZSTD_CCtx* zc, const SeqStore_t* origSeqStore) { SeqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; SeqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; SeqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; size_t estimatedOriginalSize; size_t estimatedFirstHalfSize; size_t estimatedSecondHalfSize; size_t midIdx = (startIdx + endIdx)/2; DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); assert(endIdx >= startIdx); if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); return; } ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { return; } if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); splits->splitLocations[splits->idx] = (U32)midIdx; splits->idx++; ZSTD_deriveBlockSplitsHelper(splits, midIdx, endIdx, zc, origSeqStore); } } /* Base recursive function. * Populates a table with intra-block partition indices that can improve compression ratio. * * @return: number of splits made (which equals the size of the partition table - 1). */ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { seqStoreSplits splits; splits.splitLocations = partitions; splits.idx = 0; if (nbSeq <= 4) { DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); /* Refuse to try and split anything with less than 4 sequences */ return 0; } ZSTD_deriveBlockSplitsHelper(&splits, 0, nbSeq, zc, &zc->seqStore); splits.splitLocations[splits.idx] = nbSeq; DEBUGLOG(5, "ZSTD_deriveBlockSplits: final nb partitions: %zu", splits.idx+1); return splits.idx; } /* ZSTD_compressBlock_splitBlock(): * Attempts to split a given block into multiple blocks to improve compression ratio. * * Returns combined size of all blocks (which includes headers), or a ZSTD error code. */ static size_t ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) { size_t cSize = 0; const BYTE* ip = (const BYTE*)src; BYTE* op = (BYTE*)dst; size_t i = 0; size_t srcBytesTotal = 0; U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ SeqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; SeqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two * separate repcode histories that simulate repcode history on compression and decompression side, * and use the histories to determine whether we must replace a particular repcode with its raw offset. * * 1) cRep gets updated for each partition, regardless of whether the block was emitted as uncompressed * or RLE. This allows us to retrieve the offset value that an invalid repcode references within * a nocompress/RLE block. * 2) dRep gets updated only for compressed partitions, and when a repcode gets replaced, will use * the replacement offset value rather than the original repcode to update the repcode history. * dRep also will be the final repcode history sent to the next block. * * See ZSTD_seqStore_resolveOffCodes() for more details. */ Repcodes_t dRep; Repcodes_t cRep; ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ZSTD_memset(nextSeqStore, 0, sizeof(SeqStore_t)); DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, (unsigned)zc->blockState.matchState.nextToUpdate); if (numSplits == 0) { size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, &dRep, &cRep, op, dstCapacity, ip, blockSize, lastBlock, 0 /* isPartition */); FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); assert(zc->blockSizeMax <= ZSTD_BLOCKSIZE_MAX); assert(cSizeSingleBlock <= zc->blockSizeMax + ZSTD_blockHeaderSize); return cSizeSingleBlock; } ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); for (i = 0; i <= numSplits; ++i) { size_t cSizeChunk; U32 const lastPartition = (i == numSplits); U32 lastBlockEntireSrc = 0; size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); srcBytesTotal += srcBytes; if (lastPartition) { /* This is the final partition, need to account for possible last literals */ srcBytes += blockSize - srcBytesTotal; lastBlockEntireSrc = lastBlock; } else { ZSTD_deriveSeqStoreChunk(nextSeqStore, &zc->seqStore, partitions[i], partitions[i+1]); } cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, currSeqStore, &dRep, &cRep, op, dstCapacity, ip, srcBytes, lastBlockEntireSrc, 1 /* isPartition */); DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); ip += srcBytes; op += cSizeChunk; dstCapacity -= cSizeChunk; cSize += cSizeChunk; *currSeqStore = *nextSeqStore; assert(cSizeChunk <= zc->blockSizeMax + ZSTD_blockHeaderSize); } /* cRep and dRep may have diverged during the compression. * If so, we use the dRep repcodes for the next block. */ ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(Repcodes_t)); return cSize; } static size_t ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) { U32 nbSeq; size_t cSize; DEBUGLOG(5, "ZSTD_compressBlock_splitBlock"); assert(zc->appliedParams.postBlockSplitter == ZSTD_ps_enable); { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); if (bss == ZSTDbss_noCompress) { if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); DEBUGLOG(5, "ZSTD_compressBlock_splitBlock: Nocompress block"); return cSize; } nbSeq = (U32)(zc->seqStore.sequences - zc->seqStore.sequencesStart); } cSize = ZSTD_compressBlock_splitBlock_internal(zc, dst, dstCapacity, src, srcSize, lastBlock, nbSeq); FORWARD_IF_ERROR(cSize, "Splitting blocks failed!"); return cSize; } static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 frame) { /* This is an estimated upper bound for the length of an rle block. * This isn't the actual upper bound. * Finding the real threshold needs further investigation. */ const U32 rleMaxLength = 25; size_t cSize; const BYTE* ip = (const BYTE*)src; BYTE* op = (BYTE*)dst; DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, (unsigned)zc->blockState.matchState.nextToUpdate); { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); if (bss == ZSTDbss_noCompress) { RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); cSize = 0; goto out; } } if (zc->seqCollector.collectSequences) { FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed"); ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); return 0; } /* encode sequences and literals */ cSize = ZSTD_entropyCompressSeqStore(&zc->seqStore, &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, &zc->appliedParams, dst, dstCapacity, srcSize, zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */, zc->bmi2); if (frame && /* We don't want to emit our first block as a RLE even if it qualifies because * doing so will cause the decoder (cli only) to throw a "should consume all input error." * This is only an issue for zstd <= v1.4.3 */ !zc->isFirstBlock && cSize < rleMaxLength && ZSTD_isRLE(ip, srcSize)) { cSize = 1; op[0] = ip[0]; } out: if (!ZSTD_isError(cSize) && cSize > 1) { ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); } /* We check that dictionaries have offset codes available for the first * block. After the first block, the offcode table might not have large * enough codes to represent the offsets in the data. */ if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; return cSize; } static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const size_t bss, U32 lastBlock) { DEBUGLOG(6, "Attempting ZSTD_compressSuperBlock()"); if (bss == ZSTDbss_compress) { if (/* We don't want to emit our first block as a RLE even if it qualifies because * doing so will cause the decoder (cli only) to throw a "should consume all input error." * This is only an issue for zstd <= v1.4.3 */ !zc->isFirstBlock && ZSTD_maybeRLE(&zc->seqStore) && ZSTD_isRLE((BYTE const*)src, srcSize)) { return ZSTD_rleCompressBlock(dst, dstCapacity, *(BYTE const*)src, srcSize, lastBlock); } /* Attempt superblock compression. * * Note that compressed size of ZSTD_compressSuperBlock() is not bound by the * standard ZSTD_compressBound(). This is a problem, because even if we have * space now, taking an extra byte now could cause us to run out of space later * and violate ZSTD_compressBound(). * * Define blockBound(blockSize) = blockSize + ZSTD_blockHeaderSize. * * In order to respect ZSTD_compressBound() we must attempt to emit a raw * uncompressed block in these cases: * * cSize == 0: Return code for an uncompressed block. * * cSize == dstSize_tooSmall: We may have expanded beyond blockBound(srcSize). * ZSTD_noCompressBlock() will return dstSize_tooSmall if we are really out of * output space. * * cSize >= blockBound(srcSize): We have expanded the block too much so * emit an uncompressed block. */ { size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); if (cSize != ERROR(dstSize_tooSmall)) { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); return cSize; } } } } /* if (bss == ZSTDbss_compress)*/ DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); /* Superblock compression failed, attempt to emit a single no compress block. * The decoder will be able to stream this block since it is uncompressed. */ return ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); } static size_t ZSTD_compressBlock_targetCBlockSize(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) { size_t cSize = 0; const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); DEBUGLOG(5, "ZSTD_compressBlock_targetCBlockSize (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u, srcSize=%zu)", (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, (unsigned)zc->blockState.matchState.nextToUpdate, srcSize); FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); cSize = ZSTD_compressBlock_targetCBlockSize_body(zc, dst, dstCapacity, src, srcSize, bss, lastBlock); FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize_body failed"); if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; return cSize; } static void ZSTD_overflowCorrectIfNeeded(ZSTD_MatchState_t* ms, ZSTD_cwksp* ws, ZSTD_CCtx_params const* params, void const* ip, void const* iend) { U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy); U32 const maxDist = (U32)1 << params->cParams.windowLog; if (ZSTD_window_needOverflowCorrection(ms->window, cycleLog, maxDist, ms->loadedDictEnd, ip, iend)) { U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip); ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30); ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30); ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); ZSTD_cwksp_mark_tables_dirty(ws); ZSTD_reduceIndex(ms, params, correction); ZSTD_cwksp_mark_tables_clean(ws); if (ms->nextToUpdate < correction) ms->nextToUpdate = 0; else ms->nextToUpdate -= correction; /* invalidate dictionaries on overflow correction */ ms->loadedDictEnd = 0; ms->dictMatchState = NULL; } } #include "zstd_preSplit.h" static size_t ZSTD_optimalBlockSize(ZSTD_CCtx* cctx, const void* src, size_t srcSize, size_t blockSizeMax, int splitLevel, ZSTD_strategy strat, S64 savings) { /* split level based on compression strategy, from `fast` to `btultra2` */ static const int splitLevels[] = { 0, 0, 1, 2, 2, 3, 3, 4, 4, 4 }; /* note: conservatively only split full blocks (128 KB) currently. * While it's possible to go lower, let's keep it simple for a first implementation. * Besides, benefits of splitting are reduced when blocks are already small. */ if (srcSize < 128 KB || blockSizeMax < 128 KB) return MIN(srcSize, blockSizeMax); /* do not split incompressible data though: * require verified savings to allow pre-splitting. * Note: as a consequence, the first full block is not split. */ if (savings < 3) { DEBUGLOG(6, "don't attempt splitting: savings (%i) too low", (int)savings); return 128 KB; } /* apply @splitLevel, or use default value (which depends on @strat). * note that splitting heuristic is still conditioned by @savings >= 3, * so the first block will not reach this code path */ if (splitLevel == 1) return 128 KB; if (splitLevel == 0) { assert(ZSTD_fast <= strat && strat <= ZSTD_btultra2); splitLevel = splitLevels[strat]; } else { assert(2 <= splitLevel && splitLevel <= 6); splitLevel -= 2; } return ZSTD_splitBlock(src, blockSizeMax, splitLevel, cctx->tmpWorkspace, cctx->tmpWkspSize); } /*! ZSTD_compress_frameChunk() : * Compress a chunk of data into one or multiple blocks. * All blocks will be terminated, all input will be consumed. * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. * Frame is supposed already started (header already produced) * @return : compressed size, or an error code */ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastFrameChunk) { size_t blockSizeMax = cctx->blockSizeMax; size_t remaining = srcSize; const BYTE* ip = (const BYTE*)src; BYTE* const ostart = (BYTE*)dst; BYTE* op = ostart; U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog; S64 savings = (S64)cctx->consumedSrcSize - (S64)cctx->producedCSize; assert(cctx->appliedParams.cParams.windowLog <= ZSTD_WINDOWLOG_MAX); DEBUGLOG(5, "ZSTD_compress_frameChunk (srcSize=%u, blockSizeMax=%u)", (unsigned)srcSize, (unsigned)blockSizeMax); if (cctx->appliedParams.fParams.checksumFlag && srcSize) xxh64_update(&cctx->xxhState, src, srcSize); while (remaining) { ZSTD_MatchState_t* const ms = &cctx->blockState.matchState; size_t const blockSize = ZSTD_optimalBlockSize(cctx, ip, remaining, blockSizeMax, cctx->appliedParams.preBlockSplitter_level, cctx->appliedParams.cParams.strategy, savings); U32 const lastBlock = lastFrameChunk & (blockSize == remaining); assert(blockSize <= remaining); /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding * additional 1. We need to revisit and change this logic to be more consistent */ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, dstSize_tooSmall, "not enough space to store compressed block"); ZSTD_overflowCorrectIfNeeded( ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize); ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); ZSTD_window_enforceMaxDist(&ms->window, ip, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); /* Ensure hash/chain table insertion resumes no sooner than lowlimit */ if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit; { size_t cSize; if (ZSTD_useTargetCBlockSize(&cctx->appliedParams)) { cSize = ZSTD_compressBlock_targetCBlockSize(cctx, op, dstCapacity, ip, blockSize, lastBlock); FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize failed"); assert(cSize > 0); assert(cSize <= blockSize + ZSTD_blockHeaderSize); } else if (ZSTD_blockSplitterEnabled(&cctx->appliedParams)) { cSize = ZSTD_compressBlock_splitBlock(cctx, op, dstCapacity, ip, blockSize, lastBlock); FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_splitBlock failed"); assert(cSize > 0 || cctx->seqCollector.collectSequences == 1); } else { cSize = ZSTD_compressBlock_internal(cctx, op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize, ip, blockSize, 1 /* frame */); FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_internal failed"); if (cSize == 0) { /* block is not compressible */ cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); } else { U32 const cBlockHeader = cSize == 1 ? lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); MEM_writeLE24(op, cBlockHeader); cSize += ZSTD_blockHeaderSize; } } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ /* @savings is employed to ensure that splitting doesn't worsen expansion of incompressible data. * Without splitting, the maximum expansion is 3 bytes per full block. * An adversarial input could attempt to fudge the split detector, * and make it split incompressible data, resulting in more block headers. * Note that, since ZSTD_COMPRESSBOUND() assumes a worst case scenario of 1KB per block, * and the splitter never creates blocks that small (current lower limit is 8 KB), * there is already no risk to expand beyond ZSTD_COMPRESSBOUND() limit. * But if the goal is to not expand by more than 3-bytes per 128 KB full block, * then yes, it becomes possible to make the block splitter oversplit incompressible data. * Using @savings, we enforce an even more conservative condition, * requiring the presence of enough savings (at least 3 bytes) to authorize splitting, * otherwise only full blocks are used. * But being conservative is fine, * since splitting barely compressible blocks is not fruitful anyway */ savings += (S64)blockSize - (S64)cSize; ip += blockSize; assert(remaining >= blockSize); remaining -= blockSize; op += cSize; assert(dstCapacity >= cSize); dstCapacity -= cSize; cctx->isFirstBlock = 0; DEBUGLOG(5, "ZSTD_compress_frameChunk: adding a block of size %u", (unsigned)cSize); } } if (lastFrameChunk && (op>ostart)) cctx->stage = ZSTDcs_ending; return (size_t)(op-ostart); } static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, const ZSTD_CCtx_params* params, U64 pledgedSrcSize, U32 dictID) { BYTE* const op = (BYTE*)dst; U32 const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536); /* 0-3 */ U32 const dictIDSizeCode = params->fParams.noDictIDFlag ? 0 : dictIDSizeCodeLength; /* 0-3 */ U32 const checksumFlag = params->fParams.checksumFlag>0; U32 const windowSize = (U32)1 << params->cParams.windowLog; U32 const singleSegment = params->fParams.contentSizeFlag && (windowSize >= pledgedSrcSize); BYTE const windowLogByte = (BYTE)((params->cParams.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN) << 3); U32 const fcsCode = params->fParams.contentSizeFlag ? (pledgedSrcSize>=256) + (pledgedSrcSize>=65536+256) + (pledgedSrcSize>=0xFFFFFFFFU) : 0; /* 0-3 */ BYTE const frameHeaderDescriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag<<2) + (singleSegment<<5) + (fcsCode<<6) ); size_t pos=0; assert(!(params->fParams.contentSizeFlag && pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN)); RETURN_ERROR_IF(dstCapacity < ZSTD_FRAMEHEADERSIZE_MAX, dstSize_tooSmall, "dst buf is too small to fit worst-case frame header size."); DEBUGLOG(4, "ZSTD_writeFrameHeader : dictIDFlag : %u ; dictID : %u ; dictIDSizeCode : %u", !params->fParams.noDictIDFlag, (unsigned)dictID, (unsigned)dictIDSizeCode); if (params->format == ZSTD_f_zstd1) { MEM_writeLE32(dst, ZSTD_MAGICNUMBER); pos = 4; } op[pos++] = frameHeaderDescriptionByte; if (!singleSegment) op[pos++] = windowLogByte; switch(dictIDSizeCode) { default: assert(0); /* impossible */ ZSTD_FALLTHROUGH; case 0 : break; case 1 : op[pos] = (BYTE)(dictID); pos++; break; case 2 : MEM_writeLE16(op+pos, (U16)dictID); pos+=2; break; case 3 : MEM_writeLE32(op+pos, dictID); pos+=4; break; } switch(fcsCode) { default: assert(0); /* impossible */ ZSTD_FALLTHROUGH; case 0 : if (singleSegment) op[pos++] = (BYTE)(pledgedSrcSize); break; case 1 : MEM_writeLE16(op+pos, (U16)(pledgedSrcSize-256)); pos+=2; break; case 2 : MEM_writeLE32(op+pos, (U32)(pledgedSrcSize)); pos+=4; break; case 3 : MEM_writeLE64(op+pos, (U64)(pledgedSrcSize)); pos+=8; break; } return pos; } /* ZSTD_writeSkippableFrame_advanced() : * Writes out a skippable frame with the specified magic number variant (16 are supported), * from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15, and the desired source data. * * Returns the total number of bytes written, or a ZSTD error code. */ size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned magicVariant) { BYTE* op = (BYTE*)dst; RETURN_ERROR_IF(dstCapacity < srcSize + ZSTD_SKIPPABLEHEADERSIZE /* Skippable frame overhead */, dstSize_tooSmall, "Not enough room for skippable frame"); RETURN_ERROR_IF(srcSize > (unsigned)0xFFFFFFFF, srcSize_wrong, "Src size too large for skippable frame"); RETURN_ERROR_IF(magicVariant > 15, parameter_outOfBound, "Skippable frame magic number variant not supported"); MEM_writeLE32(op, (U32)(ZSTD_MAGIC_SKIPPABLE_START + magicVariant)); MEM_writeLE32(op+4, (U32)srcSize); ZSTD_memcpy(op+8, src, srcSize); return srcSize + ZSTD_SKIPPABLEHEADERSIZE; } /* ZSTD_writeLastEmptyBlock() : * output an empty Block with end-of-frame mark to complete a frame * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h)) * or an error code if `dstCapacity` is too small (<ZSTD_blockHeaderSize) */ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity) { RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "dst buf is too small to write frame trailer empty block."); { U32 const cBlockHeader24 = 1 /*lastBlock*/ + (((U32)bt_raw)<<1); /* 0 size */ MEM_writeLE24(dst, cBlockHeader24); return ZSTD_blockHeaderSize; } } void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) { assert(cctx->stage == ZSTDcs_init); assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable); cctx->externSeqStore.seq = seq; cctx->externSeqStore.size = nbSeq; cctx->externSeqStore.capacity = nbSeq; cctx->externSeqStore.pos = 0; cctx->externSeqStore.posInSequence = 0; } static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 frame, U32 lastFrameChunk) { ZSTD_MatchState_t* const ms = &cctx->blockState.matchState; size_t fhSize = 0; DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u", cctx->stage, (unsigned)srcSize); RETURN_ERROR_IF(cctx->stage==ZSTDcs_created, stage_wrong, "missing init (ZSTD_compressBegin)"); if (frame && (cctx->stage==ZSTDcs_init)) { fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, cctx->pledgedSrcSizePlusOne-1, cctx->dictID); FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); assert(fhSize <= dstCapacity); dstCapacity -= fhSize; dst = (char*)dst + fhSize; cctx->stage = ZSTDcs_ongoing; } if (!srcSize) return fhSize; /* do not generate an empty block if no input */ if (!ZSTD_window_update(&ms->window, src, srcSize, ms->forceNonContiguous)) { ms->forceNonContiguous = 0; ms->nextToUpdate = ms->window.dictLimit; } if (cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { ZSTD_window_update(&cctx->ldmState.window, src, srcSize, /* forceNonContiguous */ 0); } if (!frame) { /* overflow check and correction for block mode */ ZSTD_overflowCorrectIfNeeded( ms, &cctx->workspace, &cctx->appliedParams, src, (BYTE const*)src + srcSize); } DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSizeMax); { size_t const cSize = frame ? ZSTD_compress_frameChunk (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) : ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize, 0 /* frame */); FORWARD_IF_ERROR(cSize, "%s", frame ? "ZSTD_compress_frameChunk failed" : "ZSTD_compressBlock_internal failed"); cctx->consumedSrcSize += srcSize; cctx->producedCSize += (cSize + fhSize); assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0)); if (cctx->pledgedSrcSizePlusOne != 0) { /* control src size */ ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1); RETURN_ERROR_IF( cctx->consumedSrcSize+1 > cctx->pledgedSrcSizePlusOne, srcSize_wrong, "error : pledgedSrcSize = %u, while realSrcSize >= %u", (unsigned)cctx->pledgedSrcSizePlusOne-1, (unsigned)cctx->consumedSrcSize); } return cSize + fhSize; } } size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) { DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); } /* NOTE: Must just wrap ZSTD_compressContinue_public() */ size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) { return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize); } static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) { ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; assert(!ZSTD_checkCParams(cParams)); return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); } /* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() */ size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) { return ZSTD_getBlockSize_deprecated(cctx); } /* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) { DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); } /* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) { return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); } /*! ZSTD_loadDictionaryContent() : * @return : 0, or an error code */ static size_t ZSTD_loadDictionaryContent(ZSTD_MatchState_t* ms, ldmState_t* ls, ZSTD_cwksp* ws, ZSTD_CCtx_params const* params, const void* src, size_t srcSize, ZSTD_dictTableLoadMethod_e dtlm, ZSTD_tableFillPurpose_e tfp) { const BYTE* ip = (const BYTE*) src; const BYTE* const iend = ip + srcSize; int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; /* Assert that the ms params match the params we're being given */ ZSTD_assertEqualCParams(params->cParams, ms->cParams); { /* Ensure large dictionaries can't cause index overflow */ /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. * Dictionaries right at the edge will immediately trigger overflow * correction, but I don't want to insert extra constraints here. */ U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { /* Some dictionary matchfinders in zstd use "short cache", * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each * CDict hashtable entry as a tag rather than as part of an index. * When short cache is used, we need to truncate the dictionary * so that its indices don't overlap with the tag. */ U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); assert(!loadLdmDict); } /* If the dictionary is too large, only load the suffix of the dictionary. */ if (srcSize > maxDictSize) { ip = iend - maxDictSize; src = ip; srcSize = maxDictSize; } } if (srcSize > ZSTD_CHUNKSIZE_MAX) { /* We must have cleared our windows when our source is this large. */ assert(ZSTD_window_isEmpty(ms->window)); if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); } ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); DEBUGLOG(4, "ZSTD_loadDictionaryContent: useRowMatchFinder=%d", (int)params->useRowMatchFinder); if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ DEBUGLOG(4, "ZSTD_loadDictionaryContent: Trigger loadLdmDict"); ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); DEBUGLOG(4, "ZSTD_loadDictionaryContent: ZSTD_ldm_fillHashTable completes"); } /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */ { U32 maxDictSize = 1U << MIN(MAX(params->cParams.hashLog + 3, params->cParams.chainLog + 1), 31); if (srcSize > maxDictSize) { ip = iend - maxDictSize; src = ip; srcSize = maxDictSize; } } ms->nextToUpdate = (U32)(ip - ms->window.base); ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); ms->forceNonContiguous = params->deterministicRefPrefix; if (srcSize <= HASH_READ_SIZE) return 0; ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); switch(params->cParams.strategy) { case ZSTD_fast: ZSTD_fillHashTable(ms, iend, dtlm, tfp); break; case ZSTD_dfast: #ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); #else assert(0); /* shouldn't be called: cparams should've been adjusted. */ #endif break; case ZSTD_greedy: case ZSTD_lazy: case ZSTD_lazy2: #if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) assert(srcSize >= HASH_READ_SIZE); if (ms->dedicatedDictSearch) { assert(ms->chainTable != NULL); ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, iend-HASH_READ_SIZE); } else { assert(params->useRowMatchFinder != ZSTD_ps_auto); if (params->useRowMatchFinder == ZSTD_ps_enable) { size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); ZSTD_memset(ms->tagTable, 0, tagTableSize); ZSTD_row_update(ms, iend-HASH_READ_SIZE); DEBUGLOG(4, "Using row-based hash table for lazy dict"); } else { ZSTD_insertAndFindFirstIndex(ms, iend-HASH_READ_SIZE); DEBUGLOG(4, "Using chain-based hash table for lazy dict"); } } #else assert(0); /* shouldn't be called: cparams should've been adjusted. */ #endif break; case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ case ZSTD_btopt: case ZSTD_btultra: case ZSTD_btultra2: #if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) assert(srcSize >= HASH_READ_SIZE); DEBUGLOG(4, "Fill %u bytes into the Binary Tree", (unsigned)srcSize); ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); #else assert(0); /* shouldn't be called: cparams should've been adjusted. */ #endif break; default: assert(0); /* not possible : not a valid strategy id */ } ms->nextToUpdate = (U32)(iend - ms->window.base); return 0; } /* Dictionaries that assign zero probability to symbols that show up causes problems * when FSE encoding. Mark dictionaries with zero probability symbols as FSE_repeat_check * and only dictionaries with 100% valid symbols can be assumed valid. */ static FSE_repeat ZSTD_dictNCountRepeat(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) { U32 s; if (dictMaxSymbolValue < maxSymbolValue) { return FSE_repeat_check; } for (s = 0; s <= maxSymbolValue; ++s) { if (normalizedCounter[s] == 0) { return FSE_repeat_check; } } return FSE_repeat_valid; } size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, const void* const dict, size_t dictSize) { short offcodeNCount[MaxOff+1]; unsigned offcodeMaxValue = MaxOff; const BYTE* dictPtr = (const BYTE*)dict; /* skip magic num and dict ID */ const BYTE* const dictEnd = dictPtr + dictSize; dictPtr += 8; bs->entropy.huf.repeatMode = HUF_repeat_check; { unsigned maxSymbolValue = 255; unsigned hasZeroWeights = 1; size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr, (size_t)(dictEnd-dictPtr), &hasZeroWeights); /* We only set the loaded table as valid if it contains all non-zero * weights. Otherwise, we set it to check */ if (!hasZeroWeights && maxSymbolValue == 255) bs->entropy.huf.repeatMode = HUF_repeat_valid; RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); dictPtr += hufHeaderSize; } { unsigned offcodeLog; size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, (size_t)(dictEnd-dictPtr)); RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, ""); RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, ""); /* fill all offset symbols to avoid garbage at end of table */ RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( bs->entropy.fse.offcodeCTable, offcodeNCount, MaxOff, offcodeLog, workspace, HUF_WORKSPACE_SIZE)), dictionary_corrupted, ""); /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */ dictPtr += offcodeHeaderSize; } { short matchlengthNCount[MaxML+1]; unsigned matchlengthMaxValue = MaxML, matchlengthLog; size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, ""); RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, ""); RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( bs->entropy.fse.matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, workspace, HUF_WORKSPACE_SIZE)), dictionary_corrupted, ""); bs->entropy.fse.matchlength_repeatMode = ZSTD_dictNCountRepeat(matchlengthNCount, matchlengthMaxValue, MaxML); dictPtr += matchlengthHeaderSize; } { short litlengthNCount[MaxLL+1]; unsigned litlengthMaxValue = MaxLL, litlengthLog; size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, ""); RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, ""); RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( bs->entropy.fse.litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, workspace, HUF_WORKSPACE_SIZE)), dictionary_corrupted, ""); bs->entropy.fse.litlength_repeatMode = ZSTD_dictNCountRepeat(litlengthNCount, litlengthMaxValue, MaxLL); dictPtr += litlengthHeaderSize; } RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted, ""); bs->rep[0] = MEM_readLE32(dictPtr+0); bs->rep[1] = MEM_readLE32(dictPtr+4); bs->rep[2] = MEM_readLE32(dictPtr+8); dictPtr += 12; { size_t const dictContentSize = (size_t)(dictEnd - dictPtr); U32 offcodeMax = MaxOff; if (dictContentSize <= ((U32)-1) - 128 KB) { U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */ offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */ } /* All offset values <= dictContentSize + 128 KB must be representable for a valid table */ bs->entropy.fse.offcode_repeatMode = ZSTD_dictNCountRepeat(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)); /* All repCodes must be <= dictContentSize and != 0 */ { U32 u; for (u=0; u<3; u++) { RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, ""); RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, ""); } } } return (size_t)(dictPtr - (const BYTE*)dict); } /* Dictionary format : * See : * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#dictionary-format */ /*! ZSTD_loadZstdDictionary() : * @return : dictID, or an error code * assumptions : magic number supposed already checked * dictSize supposed >= 8 */ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_MatchState_t* ms, ZSTD_cwksp* ws, ZSTD_CCtx_params const* params, const void* dict, size_t dictSize, ZSTD_dictTableLoadMethod_e dtlm, ZSTD_tableFillPurpose_e tfp, void* workspace) { const BYTE* dictPtr = (const BYTE*)dict; const BYTE* const dictEnd = dictPtr + dictSize; size_t dictID; size_t eSize; ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); assert(dictSize >= 8); assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY); dictID = params->fParams.noDictIDFlag ? 0 : MEM_readLE32(dictPtr + 4 /* skip magic number */ ); eSize = ZSTD_loadCEntropy(bs, workspace, dict, dictSize); FORWARD_IF_ERROR(eSize, "ZSTD_loadCEntropy failed"); dictPtr += eSize; { size_t const dictContentSize = (size_t)(dictEnd - dictPtr); FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); } return dictID; } /* ZSTD_compress_insertDictionary() : * @return : dictID, or an error code */ static size_t ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_MatchState_t* ms, ldmState_t* ls, ZSTD_cwksp* ws, const ZSTD_CCtx_params* params, const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, ZSTD_dictTableLoadMethod_e dtlm, ZSTD_tableFillPurpose_e tfp, void* workspace) { DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); if ((dict==NULL) || (dictSize<8)) { RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); return 0; } ZSTD_reset_compressedBlockState(bs); /* dict restricted modes */ if (dictContentType == ZSTD_dct_rawContent) return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { if (dictContentType == ZSTD_dct_auto) { DEBUGLOG(4, "raw content dictionary detected"); return ZSTD_loadDictionaryContent( ms, ls, ws, params, dict, dictSize, dtlm, tfp); } RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); assert(0); /* impossible */ } /* dict as full zstd dictionary */ return ZSTD_loadZstdDictionary( bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); } #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL) /*! ZSTD_compressBegin_internal() : * Assumption : either @dict OR @cdict (or none) is non-NULL, never both * @return : 0, or an error code */ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, ZSTD_dictTableLoadMethod_e dtlm, const ZSTD_CDict* cdict, const ZSTD_CCtx_params* params, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { size_t const dictContentSize = cdict ? cdict->dictContentSize : dictSize; DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params->cParams.windowLog); /* params are supposed to be fully validated at this point */ assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); assert(!((dict) && (cdict))); /* either dict or cdict, not both */ if ( (cdict) && (cdict->dictContentSize > 0) && ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN || cdict->compressionLevel == 0) && (params->attachDictPref != ZSTD_dictForceLoad) ) { return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSize, zbuff); } FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, dictContentSize, ZSTDcrp_makeClean, zbuff) , ""); { size_t const dictID = cdict ? ZSTD_compress_insertDictionary( cctx->blockState.prevCBlock, &cctx->blockState.matchState, &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, cdict->dictContentSize, cdict->dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->tmpWorkspace) : ZSTD_compress_insertDictionary( cctx->blockState.prevCBlock, &cctx->blockState.matchState, &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->tmpWorkspace); FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); assert(dictID <= UINT_MAX); cctx->dictID = (U32)dictID; cctx->dictContentSize = dictContentSize; } return 0; } size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, ZSTD_dictTableLoadMethod_e dtlm, const ZSTD_CDict* cdict, const ZSTD_CCtx_params* params, unsigned long long pledgedSrcSize) { DEBUGLOG(4, "ZSTD_compressBegin_advanced_internal: wlog=%u", params->cParams.windowLog); /* compression parameters verification and optimization */ FORWARD_IF_ERROR( ZSTD_checkCParams(params->cParams) , ""); return ZSTD_compressBegin_internal(cctx, dict, dictSize, dictContentType, dtlm, cdict, params, pledgedSrcSize, ZSTDb_not_buffered); } /*! ZSTD_compressBegin_advanced() : * @return : 0, or an error code */ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize) { ZSTD_CCtx_params cctxParams; ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, ZSTD_NO_CLEVEL); return ZSTD_compressBegin_advanced_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL /*cdict*/, &cctxParams, pledgedSrcSize); } static size_t ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) { ZSTD_CCtx_params cctxParams; { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel); } DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); } size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) { return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); } size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) { return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); } /*! ZSTD_writeEpilogue() : * Ends a frame. * @return : nb of bytes written into dst (or an error code) */ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) { BYTE* const ostart = (BYTE*)dst; BYTE* op = ostart; DEBUGLOG(4, "ZSTD_writeEpilogue"); RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); /* special case : empty frame */ if (cctx->stage == ZSTDcs_init) { size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); dstCapacity -= fhSize; op += fhSize; cctx->stage = ZSTDcs_ongoing; } if (cctx->stage != ZSTDcs_ending) { /* write one last empty block, make it the "last" block */ U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3); RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue"); MEM_writeLE24(op, cBlockHeader24); op += ZSTD_blockHeaderSize; dstCapacity -= ZSTD_blockHeaderSize; } if (cctx->appliedParams.fParams.checksumFlag) { U32 const checksum = (U32) xxh64_digest(&cctx->xxhState); RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum"); DEBUGLOG(4, "ZSTD_writeEpilogue: write checksum : %08X", (unsigned)checksum); MEM_writeLE32(op, checksum); op += 4; } cctx->stage = ZSTDcs_created; /* return to "created but no init" status */ return (size_t)(op-ostart); } void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) { (void)cctx; (void)extraCSize; } size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) { size_t endResult; size_t const cSize = ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 1 /* last chunk */); FORWARD_IF_ERROR(cSize, "ZSTD_compressContinue_internal failed"); endResult = ZSTD_writeEpilogue(cctx, (char*)dst + cSize, dstCapacity-cSize); FORWARD_IF_ERROR(endResult, "ZSTD_writeEpilogue failed"); assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0)); if (cctx->pledgedSrcSizePlusOne != 0) { /* control src size */ ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1); DEBUGLOG(4, "end of frame : controlling src size"); RETURN_ERROR_IF( cctx->pledgedSrcSizePlusOne != cctx->consumedSrcSize+1, srcSize_wrong, "error : pledgedSrcSize = %u, while realSrcSize = %u", (unsigned)cctx->pledgedSrcSizePlusOne-1, (unsigned)cctx->consumedSrcSize); } ZSTD_CCtx_trace(cctx, endResult); return cSize + endResult; } /* NOTE: Must just wrap ZSTD_compressEnd_public() */ size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) { return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); } size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const void* dict,size_t dictSize, ZSTD_parameters params) { DEBUGLOG(4, "ZSTD_compress_advanced"); FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, ¶ms, ZSTD_NO_CLEVEL); return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctx->simpleApiParams); } /* Internal */ size_t ZSTD_compress_advanced_internal( ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const void* dict,size_t dictSize, const ZSTD_CCtx_params* params) { DEBUGLOG(4, "ZSTD_compress_advanced_internal (srcSize:%u)", (unsigned)srcSize); FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, params, srcSize, ZSTDb_not_buffered) , ""); return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); } size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const void* dict, size_t dictSize, int compressionLevel) { { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, dict ? dictSize : 0, ZSTD_cpm_noAttachDict); assert(params.fParams.contentSizeFlag == 1); ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT: compressionLevel); } DEBUGLOG(4, "ZSTD_compress_usingDict (srcSize=%u)", (unsigned)srcSize); return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctx->simpleApiParams); } size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel) { DEBUGLOG(4, "ZSTD_compressCCtx (srcSize=%u)", (unsigned)srcSize); assert(cctx != NULL); return ZSTD_compress_usingDict(cctx, dst, dstCapacity, src, srcSize, NULL, 0, compressionLevel); } size_t ZSTD_compress(void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel) { size_t result; ZSTD_CCtx* cctx = ZSTD_createCCtx(); RETURN_ERROR_IF(!cctx, memory_allocation, "ZSTD_createCCtx failed"); result = ZSTD_compressCCtx(cctx, dst, dstCapacity, src, srcSize, compressionLevel); ZSTD_freeCCtx(cctx); return result; } /* ===== Dictionary API ===== */ /*! ZSTD_estimateCDictSize_advanced() : * Estimate amount of memory that will be needed to create a dictionary with following arguments */ size_t ZSTD_estimateCDictSize_advanced( size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod) { DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (unsigned)sizeof(ZSTD_CDict)); return ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) /* enableDedicatedDictSearch == 1 ensures that CDict estimation will not be too small * in case we are using DDS with row-hash. */ + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams), /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *)))); } size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel) { ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); return ZSTD_estimateCDictSize_advanced(dictSize, cParams, ZSTD_dlm_byCopy); } size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict) { if (cdict==NULL) return 0; /* support sizeof on NULL */ DEBUGLOG(5, "sizeof(*cdict) : %u", (unsigned)sizeof(*cdict)); /* cdict may be in the workspace */ return (cdict->workspace.workspace == cdict ? 0 : sizeof(*cdict)) + ZSTD_cwksp_sizeof(&cdict->workspace); } static size_t ZSTD_initCDict_internal( ZSTD_CDict* cdict, const void* dictBuffer, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, ZSTD_CCtx_params params) { DEBUGLOG(3, "ZSTD_initCDict_internal (dictContentType:%u)", (unsigned)dictContentType); assert(!ZSTD_checkCParams(params.cParams)); cdict->matchState.cParams = params.cParams; cdict->matchState.dedicatedDictSearch = params.enableDedicatedDictSearch; if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) { cdict->dictContent = dictBuffer; } else { void *internalBuffer = ZSTD_cwksp_reserve_object(&cdict->workspace, ZSTD_cwksp_align(dictSize, sizeof(void*))); RETURN_ERROR_IF(!internalBuffer, memory_allocation, "NULL pointer!"); cdict->dictContent = internalBuffer; ZSTD_memcpy(internalBuffer, dictBuffer, dictSize); } cdict->dictContentSize = dictSize; cdict->dictContentType = dictContentType; cdict->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cdict->workspace, HUF_WORKSPACE_SIZE); /* Reset the state to no dictionary */ ZSTD_reset_compressedBlockState(&cdict->cBlockState); FORWARD_IF_ERROR(ZSTD_reset_matchState( &cdict->matchState, &cdict->workspace, ¶ms.cParams, params.useRowMatchFinder, ZSTDcrp_makeClean, ZSTDirp_reset, ZSTD_resetTarget_CDict), ""); /* (Maybe) load the dictionary * Skips loading the dictionary if it is < 8 bytes. */ { params.compressionLevel = ZSTD_CLEVEL_DEFAULT; params.fParams.contentSizeFlag = 1; { size_t const dictID = ZSTD_compress_insertDictionary( &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, ¶ms, cdict->dictContent, cdict->dictContentSize, dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); assert(dictID <= (size_t)(U32)-1); cdict->dictID = (U32)dictID; } } return 0; } static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_compressionParameters cParams, ZSTD_ParamSwitch_e useRowMatchFinder, int enableDedicatedDictSearch, ZSTD_customMem customMem) { if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; DEBUGLOG(3, "ZSTD_createCDict_advanced_internal (dictSize=%u)", (unsigned)dictSize); { size_t const workspaceSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, enableDedicatedDictSearch, /* forCCtx */ 0) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))); void* const workspace = ZSTD_customMalloc(workspaceSize, customMem); ZSTD_cwksp ws; ZSTD_CDict* cdict; if (!workspace) { ZSTD_customFree(workspace, customMem); return NULL; } ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_dynamic_alloc); cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict)); assert(cdict != NULL); ZSTD_cwksp_move(&cdict->workspace, &ws); cdict->customMem = customMem; cdict->compressionLevel = ZSTD_NO_CLEVEL; /* signals advanced API usage */ cdict->useRowMatchFinder = useRowMatchFinder; return cdict; } } ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, ZSTD_compressionParameters cParams, ZSTD_customMem customMem) { ZSTD_CCtx_params cctxParams; ZSTD_memset(&cctxParams, 0, sizeof(cctxParams)); DEBUGLOG(3, "ZSTD_createCDict_advanced, dictSize=%u, mode=%u", (unsigned)dictSize, (unsigned)dictContentType); ZSTD_CCtxParams_init(&cctxParams, 0); cctxParams.cParams = cParams; cctxParams.customMem = customMem; return ZSTD_createCDict_advanced2( dictBuffer, dictSize, dictLoadMethod, dictContentType, &cctxParams, customMem); } ZSTD_CDict* ZSTD_createCDict_advanced2( const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, const ZSTD_CCtx_params* originalCctxParams, ZSTD_customMem customMem) { ZSTD_CCtx_params cctxParams = *originalCctxParams; ZSTD_compressionParameters cParams; ZSTD_CDict* cdict; DEBUGLOG(3, "ZSTD_createCDict_advanced2, dictSize=%u, mode=%u", (unsigned)dictSize, (unsigned)dictContentType); if (!customMem.customAlloc ^ !customMem.customFree) return NULL; if (cctxParams.enableDedicatedDictSearch) { cParams = ZSTD_dedicatedDictSearch_getCParams( cctxParams.compressionLevel, dictSize); ZSTD_overrideCParams(&cParams, &cctxParams.cParams); } else { cParams = ZSTD_getCParamsFromCCtxParams( &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); } if (!ZSTD_dedicatedDictSearch_isSupported(&cParams)) { /* Fall back to non-DDSS params */ cctxParams.enableDedicatedDictSearch = 0; cParams = ZSTD_getCParamsFromCCtxParams( &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); } DEBUGLOG(3, "ZSTD_createCDict_advanced2: DedicatedDictSearch=%u", cctxParams.enableDedicatedDictSearch); cctxParams.cParams = cParams; cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); cdict = ZSTD_createCDict_advanced_internal(dictSize, dictLoadMethod, cctxParams.cParams, cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch, customMem); if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict, dict, dictSize, dictLoadMethod, dictContentType, cctxParams) )) { ZSTD_freeCDict(cdict); return NULL; } return cdict; } ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel) { ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto, cParams, ZSTD_defaultCMem); if (cdict) cdict->compressionLevel = (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel; return cdict; } ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel) { ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, cParams, ZSTD_defaultCMem); if (cdict) cdict->compressionLevel = (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel; return cdict; } size_t ZSTD_freeCDict(ZSTD_CDict* cdict) { if (cdict==NULL) return 0; /* support free on NULL */ { ZSTD_customMem const cMem = cdict->customMem; int cdictInWorkspace = ZSTD_cwksp_owns_buffer(&cdict->workspace, cdict); ZSTD_cwksp_free(&cdict->workspace, cMem); if (!cdictInWorkspace) { ZSTD_customFree(cdict, cMem); } return 0; } } /*! ZSTD_initStaticCDict_advanced() : * Generate a digested dictionary in provided memory area. * workspace: The memory area to emplace the dictionary into. * Provided pointer must 8-bytes aligned. * It must outlive dictionary usage. * workspaceSize: Use ZSTD_estimateCDictSize() * to determine how large workspace must be. * cParams : use ZSTD_getCParams() to transform a compression level * into its relevant cParams. * @return : pointer to ZSTD_CDict*, or NULL if error (size too small) * Note : there is no corresponding "free" function. * Since workspace was allocated externally, it must be freed externally. */ const ZSTD_CDict* ZSTD_initStaticCDict( void* workspace, size_t workspaceSize, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, ZSTD_compressionParameters cParams) { ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams); /* enableDedicatedDictSearch == 1 ensures matchstate is not too small in case this CDict will be used for DDS + row hash */ size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0); size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))) + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + matchStateSize; ZSTD_CDict* cdict; ZSTD_CCtx_params params; DEBUGLOG(4, "ZSTD_initStaticCDict (dictSize==%u)", (unsigned)dictSize); if ((size_t)workspace & 7) return NULL; /* 8-aligned */ { ZSTD_cwksp ws; ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_static_alloc); cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict)); if (cdict == NULL) return NULL; ZSTD_cwksp_move(&cdict->workspace, &ws); } if (workspaceSize < neededSize) return NULL; ZSTD_CCtxParams_init(¶ms, 0); params.cParams = cParams; params.useRowMatchFinder = useRowMatchFinder; cdict->useRowMatchFinder = useRowMatchFinder; cdict->compressionLevel = ZSTD_NO_CLEVEL; if (ZSTD_isError( ZSTD_initCDict_internal(cdict, dict, dictSize, dictLoadMethod, dictContentType, params) )) return NULL; return cdict; } ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict) { assert(cdict != NULL); return cdict->matchState.cParams; } /*! ZSTD_getDictID_fromCDict() : * Provides the dictID of the dictionary loaded into `cdict`. * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict) { if (cdict==NULL) return 0; return cdict->dictID; } /* ZSTD_compressBegin_usingCDict_internal() : * Implementation of various ZSTD_compressBegin_usingCDict* functions. */ static size_t ZSTD_compressBegin_usingCDict_internal( ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) { ZSTD_CCtx_params cctxParams; DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_internal"); RETURN_ERROR_IF(cdict==NULL, dictionary_wrong, "NULL pointer!"); /* Initialize the cctxParams from the cdict */ { ZSTD_parameters params; params.fParams = fParams; params.cParams = ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN || cdict->compressionLevel == 0 ) ? ZSTD_getCParamsFromCDict(cdict) : ZSTD_getCParams(cdict->compressionLevel, pledgedSrcSize, cdict->dictContentSize); ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, cdict->compressionLevel); } /* Increase window log to fit the entire dictionary and source if the * source size is known. Limit the increase to 19, which is the * window log for compression level 1 with the largest source size. */ if (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN) { U32 const limitedSrcSize = (U32)MIN(pledgedSrcSize, 1U << 19); U32 const limitedSrcLog = limitedSrcSize > 1 ? ZSTD_highbit32(limitedSrcSize - 1) + 1 : 1; cctxParams.cParams.windowLog = MAX(cctxParams.cParams.windowLog, limitedSrcLog); } return ZSTD_compressBegin_internal(cctx, NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, cdict, &cctxParams, pledgedSrcSize, ZSTDb_not_buffered); } /* ZSTD_compressBegin_usingCDict_advanced() : * This function is DEPRECATED. * cdict must be != NULL */ size_t ZSTD_compressBegin_usingCDict_advanced( ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) { return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, pledgedSrcSize); } /* ZSTD_compressBegin_usingCDict() : * cdict must be != NULL */ size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) { ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); } size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) { return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); } /*! ZSTD_compress_usingCDict_internal(): * Implementation of various ZSTD_compress_usingCDict* functions. */ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) { FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); } /*! ZSTD_compress_usingCDict_advanced(): * This function is DEPRECATED. */ size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) { return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); } /*! ZSTD_compress_usingCDict() : * Compression using a digested Dictionary. * Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times. * Note that compression parameters are decided at CDict creation time * while frame parameters are hardcoded */ size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const ZSTD_CDict* cdict) { ZSTD_frameParameters const fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); } /* ****************************************************************** * Streaming ********************************************************************/ ZSTD_CStream* ZSTD_createCStream(void) { DEBUGLOG(3, "ZSTD_createCStream"); return ZSTD_createCStream_advanced(ZSTD_defaultCMem); } ZSTD_CStream* ZSTD_initStaticCStream(void *workspace, size_t workspaceSize) { return ZSTD_initStaticCCtx(workspace, workspaceSize); } ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem) { /* CStream and CCtx are now same object */ return ZSTD_createCCtx_advanced(customMem); } size_t ZSTD_freeCStream(ZSTD_CStream* zcs) { return ZSTD_freeCCtx(zcs); /* same object */ } /*====== Initialization ======*/ size_t ZSTD_CStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX; } size_t ZSTD_CStreamOutSize(void) { return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ; } static ZSTD_CParamMode_e ZSTD_getCParamMode(ZSTD_CDict const* cdict, ZSTD_CCtx_params const* params, U64 pledgedSrcSize) { if (cdict != NULL && ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) return ZSTD_cpm_attachDict; else return ZSTD_cpm_noAttachDict; } /* ZSTD_resetCStream(): * pledgedSrcSize == 0 means "unknown" */ size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pss) { /* temporary : 0 interpreted as "unknown" during transition period. * Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. * 0 will be interpreted as "empty" in the future. */ U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (unsigned)pledgedSrcSize); FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); return 0; } /*! ZSTD_initCStream_internal() : * Note : for lib/compress only. Used by zstdmt_compress.c. * Assumption 1 : params are valid * Assumption 2 : either dict, or cdict, is defined, not both */ size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs, const void* dict, size_t dictSize, const ZSTD_CDict* cdict, const ZSTD_CCtx_params* params, unsigned long long pledgedSrcSize) { DEBUGLOG(4, "ZSTD_initCStream_internal"); FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); zcs->requestedParams = *params; assert(!((dict) && (cdict))); /* either dict or cdict, not both */ if (dict) { FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); } else { /* Dictionary is cleared if !cdict */ FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); } return 0; } /* ZSTD_initCStream_usingCDict_advanced() : * same as ZSTD_initCStream_usingCDict(), with control over frame parameters */ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams, unsigned long long pledgedSrcSize) { DEBUGLOG(4, "ZSTD_initCStream_usingCDict_advanced"); FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); zcs->requestedParams.fParams = fParams; FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); return 0; } /* note : cdict must outlive compression session */ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict) { DEBUGLOG(4, "ZSTD_initCStream_usingCDict"); FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); return 0; } /* ZSTD_initCStream_advanced() : * pledgedSrcSize must be exact. * if srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. * dict is loaded with default parameters ZSTD_dct_auto and ZSTD_dlm_byCopy. */ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pss) { /* for compatibility with older programs relying on this behavior. * Users should now specify ZSTD_CONTENTSIZE_UNKNOWN. * This line will be removed in the future. */ U64 const pledgedSrcSize = (pss==0 && params.fParams.contentSizeFlag==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; DEBUGLOG(4, "ZSTD_initCStream_advanced"); FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , ""); ZSTD_CCtxParams_setZstdParams(&zcs->requestedParams, ¶ms); FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); return 0; } size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel) { DEBUGLOG(4, "ZSTD_initCStream_usingDict"); FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); return 0; } size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss) { /* temporary : 0 interpreted as "unknown" during transition period. * Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. * 0 will be interpreted as "empty" in the future. */ U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; DEBUGLOG(4, "ZSTD_initCStream_srcSize"); FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , ""); FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); return 0; } size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) { DEBUGLOG(4, "ZSTD_initCStream"); FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , ""); FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); return 0; } /*====== Compression ======*/ static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) { if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { return cctx->blockSizeMax - cctx->stableIn_notConsumed; } assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; if (hintInSize==0) hintInSize = cctx->blockSizeMax; return hintInSize; } } /* ZSTD_compressStream_generic(): * internal function for all *compressStream*() variants * @return : hint size for next input to complete ongoing block */ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input, ZSTD_EndDirective const flushMode) { const char* const istart = (assert(input != NULL), (const char*)input->src); const char* const iend = (istart != NULL) ? istart + input->size : istart; const char* ip = (istart != NULL) ? istart + input->pos : istart; char* const ostart = (assert(output != NULL), (char*)output->dst); char* const oend = (ostart != NULL) ? ostart + output->size : ostart; char* op = (ostart != NULL) ? ostart + output->pos : ostart; U32 someMoreWork = 1; /* check expectations */ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); assert(zcs != NULL); if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { assert(input->pos >= zcs->stableIn_notConsumed); input->pos -= zcs->stableIn_notConsumed; if (ip) ip -= zcs->stableIn_notConsumed; zcs->stableIn_notConsumed = 0; } if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { assert(zcs->inBuff != NULL); assert(zcs->inBuffSize > 0); } if (zcs->appliedParams.outBufferMode == ZSTD_bm_buffered) { assert(zcs->outBuff != NULL); assert(zcs->outBuffSize > 0); } if (input->src == NULL) assert(input->size == 0); assert(input->pos <= input->size); if (output->dst == NULL) assert(output->size == 0); assert(output->pos <= output->size); assert((U32)flushMode <= (U32)ZSTD_e_end); while (someMoreWork) { switch(zcs->streamStage) { case zcss_init: RETURN_ERROR(init_missing, "call ZSTD_initCStream() first!"); case zcss_load: if ( (flushMode == ZSTD_e_end) && ( (size_t)(oend-op) >= ZSTD_compressBound((size_t)(iend-ip)) /* Enough output space */ || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ && (zcs->inBuffPos == 0) ) { /* shortcut to compression pass directly into output buffer */ size_t const cSize = ZSTD_compressEnd_public(zcs, op, (size_t)(oend-op), ip, (size_t)(iend-ip)); DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); ip = iend; op += cSize; zcs->frameEnded = 1; ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); someMoreWork = 0; break; } /* complete loading into inBuffer in buffered mode */ if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos; size_t const loaded = ZSTD_limitCopy( zcs->inBuff + zcs->inBuffPos, toLoad, ip, (size_t)(iend-ip)); zcs->inBuffPos += loaded; if (ip) ip += loaded; if ( (flushMode == ZSTD_e_continue) && (zcs->inBuffPos < zcs->inBuffTarget) ) { /* not enough input to fill full block : stop here */ someMoreWork = 0; break; } if ( (flushMode == ZSTD_e_flush) && (zcs->inBuffPos == zcs->inToCompress) ) { /* empty */ someMoreWork = 0; break; } } else { assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); if ( (flushMode == ZSTD_e_continue) && ( (size_t)(iend - ip) < zcs->blockSizeMax) ) { /* can't compress a full block : stop here */ zcs->stableIn_notConsumed = (size_t)(iend - ip); ip = iend; /* pretend to have consumed input */ someMoreWork = 0; break; } if ( (flushMode == ZSTD_e_flush) && (ip == iend) ) { /* empty */ someMoreWork = 0; break; } } /* compress current block (note : this stage cannot be stopped in the middle) */ DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); { int const inputBuffered = (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered); void* cDst; size_t cSize; size_t oSize = (size_t)(oend-op); size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress : MIN((size_t)(iend - ip), zcs->blockSizeMax); if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) cDst = op; /* compress into output buffer, to skip flush stage */ else cDst = zcs->outBuff, oSize = zcs->outBuffSize; if (inputBuffered) { unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); cSize = lastBlock ? ZSTD_compressEnd_public(zcs, cDst, oSize, zcs->inBuff + zcs->inToCompress, iSize) : ZSTD_compressContinue_public(zcs, cDst, oSize, zcs->inBuff + zcs->inToCompress, iSize); FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); zcs->frameEnded = lastBlock; /* prepare next block */ zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSizeMax; if (zcs->inBuffTarget > zcs->inBuffSize) zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSizeMax; DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u", (unsigned)zcs->inBuffTarget, (unsigned)zcs->inBuffSize); if (!lastBlock) assert(zcs->inBuffTarget <= zcs->inBuffSize); zcs->inToCompress = zcs->inBuffPos; } else { /* !inputBuffered, hence ZSTD_bm_stable */ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); cSize = lastBlock ? ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); /* Consume the input prior to error checking to mirror buffered mode. */ if (ip) ip += iSize; FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); zcs->frameEnded = lastBlock; if (lastBlock) assert(ip == iend); } if (cDst == op) { /* no need to flush */ op += cSize; if (zcs->frameEnded) { DEBUGLOG(5, "Frame completed directly in outBuffer"); someMoreWork = 0; ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); } break; } zcs->outBuffContentSize = cSize; zcs->outBuffFlushedSize = 0; zcs->streamStage = zcss_flush; /* pass-through to flush stage */ } ZSTD_FALLTHROUGH; case zcss_flush: DEBUGLOG(5, "flush stage"); assert(zcs->appliedParams.outBufferMode == ZSTD_bm_buffered); { size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize; size_t const flushed = ZSTD_limitCopy(op, (size_t)(oend-op), zcs->outBuff + zcs->outBuffFlushedSize, toFlush); DEBUGLOG(5, "toFlush: %u into %u ==> flushed: %u", (unsigned)toFlush, (unsigned)(oend-op), (unsigned)flushed); if (flushed) op += flushed; zcs->outBuffFlushedSize += flushed; if (toFlush!=flushed) { /* flush not fully completed, presumably because dst is too small */ assert(op==oend); someMoreWork = 0; break; } zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0; if (zcs->frameEnded) { DEBUGLOG(5, "Frame completed on flush"); someMoreWork = 0; ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); break; } zcs->streamStage = zcss_load; break; } default: /* impossible */ assert(0); } } input->pos = (size_t)(ip - istart); output->pos = (size_t)(op - ostart); if (zcs->frameEnded) return 0; return ZSTD_nextInputSizeHint(zcs); } static size_t ZSTD_nextInputSizeHint_MTorST(const ZSTD_CCtx* cctx) { return ZSTD_nextInputSizeHint(cctx); } size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input) { FORWARD_IF_ERROR( ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue) , ""); return ZSTD_nextInputSizeHint_MTorST(zcs); } /* After a compression call set the expected input/output buffer. * This is validated at the start of the next compression call. */ static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) { DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { cctx->expectedInBuffer = *input; } if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { cctx->expectedOutBufferSize = output->size - output->pos; } } /* Validate that the input/output buffers match the expectations set by * ZSTD_setBufferExpectations. */ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input, ZSTD_EndDirective endOp) { if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ZSTD_inBuffer const expect = cctx->expectedInBuffer; if (expect.src != input->src || expect.pos != input->pos) RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); } (void)endOp; if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { size_t const outBufferSize = output->size - output->pos; if (cctx->expectedOutBufferSize != outBufferSize) RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); } return 0; } /* * If @endOp == ZSTD_e_end, @inSize becomes pledgedSrcSize. * Otherwise, it's ignored. * @return: 0 on success, or a ZSTD_error code otherwise. */ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, ZSTD_EndDirective endOp, size_t inSize) { ZSTD_CCtx_params params = cctx->requestedParams; ZSTD_prefixDict const prefixDict = cctx->prefixDict; FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */ ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); /* single usage */ assert(prefixDict.dict==NULL || cctx->cdict==NULL); /* only one can be set */ if (cctx->cdict && !cctx->localDict.cdict) { /* Let the cdict's compression level take priority over the requested params. * But do not take the cdict's compression level if the "cdict" is actually a localDict * generated from ZSTD_initLocalDict(). */ params.compressionLevel = cctx->cdict->compressionLevel; } DEBUGLOG(4, "ZSTD_CCtx_init_compressStream2 : transparent init stage"); if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ { size_t const dictSize = prefixDict.dict ? prefixDict.dictSize : (cctx->cdict ? cctx->cdict->dictContentSize : 0); ZSTD_CParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); params.cParams = ZSTD_getCParamsFromCCtxParams( ¶ms, cctx->pledgedSrcSizePlusOne-1, dictSize, mode); } params.postBlockSplitter = ZSTD_resolveBlockSplitterMode(params.postBlockSplitter, ¶ms.cParams); params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, ZSTD_dtlm_fast, cctx->cdict, ¶ms, pledgedSrcSize, ZSTDb_buffered) , ""); assert(cctx->appliedParams.nbWorkers == 0); cctx->inToCompress = 0; cctx->inBuffPos = 0; if (cctx->appliedParams.inBufferMode == ZSTD_bm_buffered) { /* for small input: avoid automatic flush on reaching end of block, since * it would require to add a 3-bytes null block to end frame */ cctx->inBuffTarget = cctx->blockSizeMax + (cctx->blockSizeMax == pledgedSrcSize); } else { cctx->inBuffTarget = 0; } cctx->outBuffContentSize = cctx->outBuffFlushedSize = 0; cctx->streamStage = zcss_load; cctx->frameEnded = 0; } return 0; } /* @return provides a minimum amount of data remaining to be flushed from internal buffers */ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input, ZSTD_EndDirective endOp) { DEBUGLOG(5, "ZSTD_compressStream2, endOp=%u ", (unsigned)endOp); /* check conditions */ RETURN_ERROR_IF(output->pos > output->size, dstSize_tooSmall, "invalid output buffer"); RETURN_ERROR_IF(input->pos > input->size, srcSize_wrong, "invalid input buffer"); RETURN_ERROR_IF((U32)endOp > (U32)ZSTD_e_end, parameter_outOfBound, "invalid endDirective"); assert(cctx != NULL); /* transparent initialization stage */ if (cctx->streamStage == zcss_init) { size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ if (cctx->stableIn_notConsumed) { /* not the first time */ /* check stable source guarantees */ RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); } /* pretend input was consumed, to give a sense forward progress */ input->pos = input->size; /* save stable inBuffer, for later control, and flush/end */ cctx->expectedInBuffer = *input; /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ cctx->stableIn_notConsumed += inputSize; /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */ return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ } FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ } /* end of transparent initialization stage */ FORWARD_IF_ERROR(ZSTD_checkBufferStability(cctx, output, input, endOp), "invalid buffers"); /* compression stage */ FORWARD_IF_ERROR( ZSTD_compressStream_generic(cctx, output, input, endOp) , ""); DEBUGLOG(5, "completed ZSTD_compressStream2"); ZSTD_setBufferExpectations(cctx, output, input); return cctx->outBuffContentSize - cctx->outBuffFlushedSize; /* remaining to flush */ } size_t ZSTD_compressStream2_simpleArgs ( ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, size_t* dstPos, const void* src, size_t srcSize, size_t* srcPos, ZSTD_EndDirective endOp) { ZSTD_outBuffer output; ZSTD_inBuffer input; output.dst = dst; output.size = dstCapacity; output.pos = *dstPos; input.src = src; input.size = srcSize; input.pos = *srcPos; /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); *dstPos = output.pos; *srcPos = input.pos; return cErr; } } size_t ZSTD_compress2(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) { ZSTD_bufferMode_e const originalInBufferMode = cctx->requestedParams.inBufferMode; ZSTD_bufferMode_e const originalOutBufferMode = cctx->requestedParams.outBufferMode; DEBUGLOG(4, "ZSTD_compress2 (srcSize=%u)", (unsigned)srcSize); ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only); /* Enable stable input/output buffers. */ cctx->requestedParams.inBufferMode = ZSTD_bm_stable; cctx->requestedParams.outBufferMode = ZSTD_bm_stable; { size_t oPos = 0; size_t iPos = 0; size_t const result = ZSTD_compressStream2_simpleArgs(cctx, dst, dstCapacity, &oPos, src, srcSize, &iPos, ZSTD_e_end); /* Reset to the original values. */ cctx->requestedParams.inBufferMode = originalInBufferMode; cctx->requestedParams.outBufferMode = originalOutBufferMode; FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); if (result != 0) { /* compression not completed, due to lack of output space */ assert(oPos == dstCapacity); RETURN_ERROR(dstSize_tooSmall, ""); } assert(iPos == srcSize); /* all input is expected consumed */ return oPos; } } /* ZSTD_validateSequence() : * @offBase : must use the format required by ZSTD_storeSeq() * @returns a ZSTD error code if sequence is not valid */ static size_t ZSTD_validateSequence(U32 offBase, U32 matchLength, U32 minMatch, size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) { U32 const windowSize = 1u << windowLog; /* posInSrc represents the amount of data the decoder would decode up to this point. * As long as the amount of data decoded is less than or equal to window size, offsets may be * larger than the total length of output decoded in order to reference the dict, even larger than * window size. After output surpasses windowSize, we're limited to windowSize offsets again. */ size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 3 : 4; RETURN_ERROR_IF(offBase > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); return 0; } /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) { U32 offBase = OFFSET_TO_OFFBASE(rawOffset); if (!ll0 && rawOffset == rep[0]) { offBase = REPCODE1_TO_OFFBASE; } else if (rawOffset == rep[1]) { offBase = REPCODE_TO_OFFBASE(2 - ll0); } else if (rawOffset == rep[2]) { offBase = REPCODE_TO_OFFBASE(3 - ll0); } else if (ll0 && rawOffset == rep[0] - 1) { offBase = REPCODE3_TO_OFFBASE; } return offBase; } /* This function scans through an array of ZSTD_Sequence, * storing the sequences it reads, until it reaches a block delimiter. * Note that the block delimiter includes the last literals of the block. * @blockSize must be == sum(sequence_lengths). * @returns @blockSize on success, and a ZSTD_error otherwise. */ static size_t ZSTD_transferSequences_wBlockDelim(ZSTD_CCtx* cctx, ZSTD_SequencePosition* seqPos, const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, const void* src, size_t blockSize, ZSTD_ParamSwitch_e externalRepSearch) { U32 idx = seqPos->idx; U32 const startIdx = idx; BYTE const* ip = (BYTE const*)(src); const BYTE* const iend = ip + blockSize; Repcodes_t updatedRepcodes; U32 dictSize; DEBUGLOG(5, "ZSTD_transferSequences_wBlockDelim (blockSize = %zu)", blockSize); if (cctx->cdict) { dictSize = (U32)cctx->cdict->dictContentSize; } else if (cctx->prefixDict.dict) { dictSize = (U32)cctx->prefixDict.dictSize; } else { dictSize = 0; } ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { U32 const litLength = inSeqs[idx].litLength; U32 const matchLength = inSeqs[idx].matchLength; U32 offBase; if (externalRepSearch == ZSTD_ps_disable) { offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); } else { U32 const ll0 = (litLength == 0); offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); } DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); if (cctx->appliedParams.validateSequences) { seqPos->posInSrc += litLength + matchLength; FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), "Sequence validation failed"); } RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); ip += matchLength + litLength; } RETURN_ERROR_IF(idx == inSeqsSize, externalSequences_invalid, "Block delimiter not found."); /* If we skipped repcode search while parsing, we need to update repcodes now */ assert(externalRepSearch != ZSTD_ps_auto); assert(idx >= startIdx); if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { U32* const rep = updatedRepcodes.rep; U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ if (lastSeqIdx >= startIdx + 2) { rep[2] = inSeqs[lastSeqIdx - 2].offset; rep[1] = inSeqs[lastSeqIdx - 1].offset; rep[0] = inSeqs[lastSeqIdx].offset; } else if (lastSeqIdx == startIdx + 1) { rep[2] = rep[0]; rep[1] = inSeqs[lastSeqIdx - 1].offset; rep[0] = inSeqs[lastSeqIdx].offset; } else { assert(lastSeqIdx == startIdx); rep[2] = rep[1]; rep[1] = rep[0]; rep[0] = inSeqs[lastSeqIdx].offset; } } ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); if (inSeqs[idx].litLength) { DEBUGLOG(6, "Storing last literals of size: %u", inSeqs[idx].litLength); ZSTD_storeLastLiterals(&cctx->seqStore, ip, inSeqs[idx].litLength); ip += inSeqs[idx].litLength; seqPos->posInSrc += inSeqs[idx].litLength; } RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); seqPos->idx = idx+1; return blockSize; } /* * This function attempts to scan through @blockSize bytes in @src * represented by the sequences in @inSeqs, * storing any (partial) sequences. * * Occasionally, we may want to reduce the actual number of bytes consumed from @src * to avoid splitting a match, notably if it would produce a match smaller than MINMATCH. * * @returns the number of bytes consumed from @src, necessarily <= @blockSize. * Otherwise, it may return a ZSTD error if something went wrong. */ static size_t ZSTD_transferSequences_noDelim(ZSTD_CCtx* cctx, ZSTD_SequencePosition* seqPos, const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, const void* src, size_t blockSize, ZSTD_ParamSwitch_e externalRepSearch) { U32 idx = seqPos->idx; U32 startPosInSequence = seqPos->posInSequence; U32 endPosInSequence = seqPos->posInSequence + (U32)blockSize; size_t dictSize; const BYTE* const istart = (const BYTE*)(src); const BYTE* ip = istart; const BYTE* iend = istart + blockSize; /* May be adjusted if we decide to process fewer than blockSize bytes */ Repcodes_t updatedRepcodes; U32 bytesAdjustment = 0; U32 finalMatchSplit = 0; /* TODO(embg) support fast parsing mode in noBlockDelim mode */ (void)externalRepSearch; if (cctx->cdict) { dictSize = cctx->cdict->dictContentSize; } else if (cctx->prefixDict.dict) { dictSize = cctx->prefixDict.dictSize; } else { dictSize = 0; } DEBUGLOG(5, "ZSTD_transferSequences_noDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { const ZSTD_Sequence currSeq = inSeqs[idx]; U32 litLength = currSeq.litLength; U32 matchLength = currSeq.matchLength; U32 const rawOffset = currSeq.offset; U32 offBase; /* Modify the sequence depending on where endPosInSequence lies */ if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { if (startPosInSequence >= litLength) { startPosInSequence -= litLength; litLength = 0; matchLength -= startPosInSequence; } else { litLength -= startPosInSequence; } /* Move to the next sequence */ endPosInSequence -= currSeq.litLength + currSeq.matchLength; startPosInSequence = 0; } else { /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence does not reach the end of the match. So, we have to split the sequence */ DEBUGLOG(6, "Require a split: diff: %u, idx: %u PIS: %u", currSeq.litLength + currSeq.matchLength - endPosInSequence, idx, endPosInSequence); if (endPosInSequence > litLength) { U32 firstHalfMatchLength; litLength = startPosInSequence >= litLength ? 0 : litLength - startPosInSequence; firstHalfMatchLength = endPosInSequence - startPosInSequence - litLength; if (matchLength > blockSize && firstHalfMatchLength >= cctx->appliedParams.cParams.minMatch) { /* Only ever split the match if it is larger than the block size */ U32 secondHalfMatchLength = currSeq.matchLength + currSeq.litLength - endPosInSequence; if (secondHalfMatchLength < cctx->appliedParams.cParams.minMatch) { /* Move the endPosInSequence backward so that it creates match of minMatch length */ endPosInSequence -= cctx->appliedParams.cParams.minMatch - secondHalfMatchLength; bytesAdjustment = cctx->appliedParams.cParams.minMatch - secondHalfMatchLength; firstHalfMatchLength -= bytesAdjustment; } matchLength = firstHalfMatchLength; /* Flag that we split the last match - after storing the sequence, exit the loop, but keep the value of endPosInSequence */ finalMatchSplit = 1; } else { /* Move the position in sequence backwards so that we don't split match, and break to store * the last literals. We use the original currSeq.litLength as a marker for where endPosInSequence * should go. We prefer to do this whenever it is not necessary to split the match, or if doing so * would cause the first half of the match to be too small */ bytesAdjustment = endPosInSequence - currSeq.litLength; endPosInSequence = currSeq.litLength; break; } } else { /* This sequence ends inside the literals, break to store the last literals */ break; } } /* Check if this offset can be represented with a repcode */ { U32 const ll0 = (litLength == 0); offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); } if (cctx->appliedParams.validateSequences) { seqPos->posInSrc += litLength + matchLength; FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), "Sequence validation failed"); } DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); ip += matchLength + litLength; if (!finalMatchSplit) idx++; /* Next Sequence */ } DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); seqPos->idx = idx; seqPos->posInSequence = endPosInSequence; ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); iend -= bytesAdjustment; if (ip != iend) { /* Store any last literals */ U32 const lastLLSize = (U32)(iend - ip); assert(ip <= iend); DEBUGLOG(6, "Storing last literals of size: %u", lastLLSize); ZSTD_storeLastLiterals(&cctx->seqStore, ip, lastLLSize); seqPos->posInSrc += lastLLSize; } return (size_t)(iend-istart); } /* @seqPos represents a position within @inSeqs, * it is read and updated by this function, * once the goal to produce a block of size @blockSize is reached. * @return: nb of bytes consumed from @src, necessarily <= @blockSize. */ typedef size_t (*ZSTD_SequenceCopier_f)(ZSTD_CCtx* cctx, ZSTD_SequencePosition* seqPos, const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, const void* src, size_t blockSize, ZSTD_ParamSwitch_e externalRepSearch); static ZSTD_SequenceCopier_f ZSTD_selectSequenceCopier(ZSTD_SequenceFormat_e mode) { assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, (int)mode)); if (mode == ZSTD_sf_explicitBlockDelimiters) { return ZSTD_transferSequences_wBlockDelim; } assert(mode == ZSTD_sf_noBlockDelimiters); return ZSTD_transferSequences_noDelim; } /* Discover the size of next block by searching for the delimiter. * Note that a block delimiter **must** exist in this mode, * otherwise it's an input error. * The block size retrieved will be later compared to ensure it remains within bounds */ static size_t blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_SequencePosition seqPos) { int end = 0; size_t blockSize = 0; size_t spos = seqPos.idx; DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); assert(spos <= inSeqsSize); while (spos < inSeqsSize) { end = (inSeqs[spos].offset == 0); blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; if (end) { if (inSeqs[spos].matchLength != 0) RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); break; } spos++; } if (!end) RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); return blockSize; } static size_t determine_blockSize(ZSTD_SequenceFormat_e mode, size_t blockSize, size_t remaining, const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_SequencePosition seqPos) { DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); if (mode == ZSTD_sf_noBlockDelimiters) { /* Note: more a "target" block size */ return MIN(remaining, blockSize); } assert(mode == ZSTD_sf_explicitBlockDelimiters); { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); if (explicitBlockSize > blockSize) RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); if (explicitBlockSize > remaining) RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); return explicitBlockSize; } } /* Compress all provided sequences, block-by-block. * * Returns the cumulative size of all compressed blocks (including their headers), * otherwise a ZSTD error. */ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const ZSTD_Sequence* inSeqs, size_t inSeqsSize, const void* src, size_t srcSize) { size_t cSize = 0; size_t remaining = srcSize; ZSTD_SequencePosition seqPos = {0, 0, 0}; const BYTE* ip = (BYTE const*)src; BYTE* op = (BYTE*)dst; ZSTD_SequenceCopier_f const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); DEBUGLOG(4, "ZSTD_compressSequences_internal srcSize: %zu, inSeqsSize: %zu", srcSize, inSeqsSize); /* Special case: empty frame */ if (remaining == 0) { U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1); RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "No room for empty frame block header"); MEM_writeLE32(op, cBlockHeader24); op += ZSTD_blockHeaderSize; dstCapacity -= ZSTD_blockHeaderSize; cSize += ZSTD_blockHeaderSize; } while (remaining) { size_t compressedSeqsSize; size_t cBlockSize; size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, cctx->blockSizeMax, remaining, inSeqs, inSeqsSize, seqPos); U32 const lastBlock = (blockSize == remaining); FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); assert(blockSize <= remaining); ZSTD_resetSeqStore(&cctx->seqStore); blockSize = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes); FORWARD_IF_ERROR(blockSize, "Bad sequence copy"); /* If blocks are too small, emit as a nocompress block */ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding * additional 1. We need to revisit and change this logic to be more consistent */ if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); DEBUGLOG(5, "Block too small (%zu): data remains uncompressed: cSize=%zu", blockSize, cBlockSize); cSize += cBlockSize; ip += blockSize; op += cBlockSize; remaining -= blockSize; dstCapacity -= cBlockSize; continue; } RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, &cctx->appliedParams, op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, blockSize, cctx->tmpWorkspace, cctx->tmpWkspSize /* statically allocated in resetCCtx */, cctx->bmi2); FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); if (!cctx->isFirstBlock && ZSTD_maybeRLE(&cctx->seqStore) && ZSTD_isRLE(ip, blockSize)) { /* Note: don't emit the first block as RLE even if it qualifies because * doing so will cause the decoder (cli <= v1.4.3 only) to throw an (invalid) error * "should consume all input error." */ compressedSeqsSize = 1; } if (compressedSeqsSize == 0) { /* ZSTD_noCompressBlock writes the block header as well */ cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); } else if (compressedSeqsSize == 1) { cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); } else { U32 cBlockHeader; /* Error checking and repcodes update */ ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState); if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; /* Write block header into beginning of block*/ cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); MEM_writeLE24(op, cBlockHeader); cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); } cSize += cBlockSize; if (lastBlock) { break; } else { ip += blockSize; op += cBlockSize; remaining -= blockSize; dstCapacity -= cBlockSize; cctx->isFirstBlock = 0; } DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); } DEBUGLOG(4, "cSize final total: %zu", cSize); return cSize; } size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const ZSTD_Sequence* inSeqs, size_t inSeqsSize, const void* src, size_t srcSize) { BYTE* op = (BYTE*)dst; size_t cSize = 0; /* Transparent initialization stage, same as compressStream2() */ DEBUGLOG(4, "ZSTD_compressSequences (nbSeqs=%zu,dstCapacity=%zu)", inSeqsSize, dstCapacity); assert(cctx != NULL); FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); /* Begin writing output, starting with frame header */ { size_t const frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, &cctx->appliedParams, srcSize, cctx->dictID); op += frameHeaderSize; assert(frameHeaderSize <= dstCapacity); dstCapacity -= frameHeaderSize; cSize += frameHeaderSize; } if (cctx->appliedParams.fParams.checksumFlag && srcSize) { xxh64_update(&cctx->xxhState, src, srcSize); } /* Now generate compressed blocks */ { size_t const cBlocksSize = ZSTD_compressSequences_internal(cctx, op, dstCapacity, inSeqs, inSeqsSize, src, srcSize); FORWARD_IF_ERROR(cBlocksSize, "Compressing blocks failed!"); cSize += cBlocksSize; assert(cBlocksSize <= dstCapacity); dstCapacity -= cBlocksSize; } /* Complete with frame checksum, if needed */ if (cctx->appliedParams.fParams.checksumFlag) { U32 const checksum = (U32) xxh64_digest(&cctx->xxhState); RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum"); DEBUGLOG(4, "Write checksum : %08X", (unsigned)checksum); MEM_writeLE32((char*)dst + cSize, checksum); cSize += 4; } DEBUGLOG(4, "Final compressed size: %zu", cSize); return cSize; } #if defined(__AVX2__) #include <immintrin.h> /* AVX2 intrinsics */ /* * Convert 2 sequences per iteration, using AVX2 intrinsics: * - offset -> offBase = offset + 2 * - litLength -> (U16) litLength * - matchLength -> (U16)(matchLength - 3) * - rep is ignored * Store only 8 bytes per SeqDef (offBase[4], litLength[2], mlBase[2]). * * At the end, instead of extracting two __m128i, * we use _mm256_permute4x64_epi64(..., 0xE8) to move lane2 into lane1, * then store the lower 16 bytes in one go. * * @returns 0 on succes, with no long length detected * @returns > 0 if there is one long length (> 65535), * indicating the position, and type. */ static size_t convertSequences_noRepcodes( SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs, size_t nbSequences) { /* * addition: * For each 128-bit half: (offset+2, litLength+0, matchLength-3, rep+0) */ const __m256i addition = _mm256_setr_epi32( ZSTD_REP_NUM, 0, -MINMATCH, 0, /* for sequence i */ ZSTD_REP_NUM, 0, -MINMATCH, 0 /* for sequence i+1 */ ); /* limit: check if there is a long length */ const __m256i limit = _mm256_set1_epi32(65535); /* * shuffle mask for byte-level rearrangement in each 128-bit half: * * Input layout (after addition) per 128-bit half: * [ offset+2 (4 bytes) | litLength (4 bytes) | matchLength (4 bytes) | rep (4 bytes) ] * We only need: * offBase (4 bytes) = offset+2 * litLength (2 bytes) = low 2 bytes of litLength * mlBase (2 bytes) = low 2 bytes of (matchLength) * => Bytes [0..3, 4..5, 8..9], zero the rest. */ const __m256i mask = _mm256_setr_epi8( /* For the lower 128 bits => sequence i */ 0, 1, 2, 3, /* offset+2 */ 4, 5, /* litLength (16 bits) */ 8, 9, /* matchLength (16 bits) */ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, /* For the upper 128 bits => sequence i+1 */ 16,17,18,19, /* offset+2 */ 20,21, /* litLength */ 24,25, /* matchLength */ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80 ); /* * Next, we'll use _mm256_permute4x64_epi64(vshf, 0xE8). * Explanation of 0xE8 = 11101000b => [lane0, lane2, lane2, lane3]. * So the lower 128 bits become [lane0, lane2] => combining seq0 and seq1. */ #define PERM_LANE_0X_E8 0xE8 /* [0,2,2,3] in lane indices */ size_t longLen = 0, i = 0; /* AVX permutation depends on the specific definition of target structures */ ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0); ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4); ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8); ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0); ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4); ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6); /* Process 2 sequences per loop iteration */ for (; i + 1 < nbSequences; i += 2) { /* Load 2 ZSTD_Sequence (32 bytes) */ __m256i vin = _mm256_loadu_si256((const __m256i*)(const void*)&inSeqs[i]); /* Add {2, 0, -3, 0} in each 128-bit half */ __m256i vadd = _mm256_add_epi32(vin, addition); /* Check for long length */ __m256i ll_cmp = _mm256_cmpgt_epi32(vadd, limit); /* 0xFFFFFFFF for element > 65535 */ int ll_res = _mm256_movemask_epi8(ll_cmp); /* Shuffle bytes so each half gives us the 8 bytes we need */ __m256i vshf = _mm256_shuffle_epi8(vadd, mask); /* * Now: * Lane0 = seq0's 8 bytes * Lane1 = 0 * Lane2 = seq1's 8 bytes * Lane3 = 0 */ /* Permute 64-bit lanes => move Lane2 down into Lane1. */ __m256i vperm = _mm256_permute4x64_epi64(vshf, PERM_LANE_0X_E8); /* * Now the lower 16 bytes (Lane0+Lane1) = [seq0, seq1]. * The upper 16 bytes are [Lane2, Lane3] = [seq1, 0], but we won't use them. */ /* Store only the lower 16 bytes => 2 SeqDef (8 bytes each) */ _mm_storeu_si128((__m128i *)(void*)&dstSeqs[i], _mm256_castsi256_si128(vperm)); /* * This writes out 16 bytes total: * - offset 0..7 => seq0 (offBase, litLength, mlBase) * - offset 8..15 => seq1 (offBase, litLength, mlBase) */ /* check (unlikely) long lengths > 65535 * indices for lengths correspond to bits [4..7], [8..11], [20..23], [24..27] * => combined mask = 0x0FF00FF0 */ if (UNLIKELY((ll_res & 0x0FF00FF0) != 0)) { /* long length detected: let's figure out which one*/ if (inSeqs[i].matchLength > 65535+MINMATCH) { assert(longLen == 0); longLen = i + 1; } if (inSeqs[i].litLength > 65535) { assert(longLen == 0); longLen = i + nbSequences + 1; } if (inSeqs[i+1].matchLength > 65535+MINMATCH) { assert(longLen == 0); longLen = i + 1 + 1; } if (inSeqs[i+1].litLength > 65535) { assert(longLen == 0); longLen = i + 1 + nbSequences + 1; } } } /* Handle leftover if @nbSequences is odd */ if (i < nbSequences) { /* process last sequence */ assert(i == nbSequences - 1); dstSeqs[i].offBase = OFFSET_TO_OFFBASE(inSeqs[i].offset); dstSeqs[i].litLength = (U16)inSeqs[i].litLength; dstSeqs[i].mlBase = (U16)(inSeqs[i].matchLength - MINMATCH); /* check (unlikely) long lengths > 65535 */ if (UNLIKELY(inSeqs[i].matchLength > 65535+MINMATCH)) { assert(longLen == 0); longLen = i + 1; } if (UNLIKELY(inSeqs[i].litLength > 65535)) { assert(longLen == 0); longLen = i + nbSequences + 1; } } return longLen; } /* the vector implementation could also be ported to SSSE3, * but since this implementation is targeting modern systems (>= Sapphire Rapid), * it's not useful to develop and maintain code for older pre-AVX2 platforms */ #else /* no AVX2 */ static size_t convertSequences_noRepcodes( SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs, size_t nbSequences) { size_t longLen = 0; size_t n; for (n=0; n<nbSequences; n++) { dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset); dstSeqs[n].litLength = (U16)inSeqs[n].litLength; dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH); /* check for long length > 65535 */ if (UNLIKELY(inSeqs[n].matchLength > 65535+MINMATCH)) { assert(longLen == 0); longLen = n + 1; } if (UNLIKELY(inSeqs[n].litLength > 65535)) { assert(longLen == 0); longLen = n + nbSequences + 1; } } return longLen; } #endif /* * Precondition: Sequences must end on an explicit Block Delimiter * @return: 0 on success, or an error code. * Note: Sequence validation functionality has been disabled (removed). * This is helpful to generate a lean main pipeline, improving performance. * It may be re-inserted later. */ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, const ZSTD_Sequence* const inSeqs, size_t nbSequences, int repcodeResolution) { Repcodes_t updatedRepcodes; size_t seqNb = 0; DEBUGLOG(5, "ZSTD_convertBlockSequences (nbSequences = %zu)", nbSequences); RETURN_ERROR_IF(nbSequences >= cctx->seqStore.maxNbSeq, externalSequences_invalid, "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); /* check end condition */ assert(nbSequences >= 1); assert(inSeqs[nbSequences-1].matchLength == 0); assert(inSeqs[nbSequences-1].offset == 0); /* Convert Sequences from public format to internal format */ if (!repcodeResolution) { size_t const longl = convertSequences_noRepcodes(cctx->seqStore.sequencesStart, inSeqs, nbSequences-1); cctx->seqStore.sequences = cctx->seqStore.sequencesStart + nbSequences-1; if (longl) { DEBUGLOG(5, "long length"); assert(cctx->seqStore.longLengthType == ZSTD_llt_none); if (longl <= nbSequences-1) { DEBUGLOG(5, "long match length detected at pos %zu", longl-1); cctx->seqStore.longLengthType = ZSTD_llt_matchLength; cctx->seqStore.longLengthPos = (U32)(longl-1); } else { DEBUGLOG(5, "long literals length detected at pos %zu", longl-nbSequences); assert(longl <= 2* (nbSequences-1)); cctx->seqStore.longLengthType = ZSTD_llt_literalLength; cctx->seqStore.longLengthPos = (U32)(longl-(nbSequences-1)-1); } } } else { for (seqNb = 0; seqNb < nbSequences - 1 ; seqNb++) { U32 const litLength = inSeqs[seqNb].litLength; U32 const matchLength = inSeqs[seqNb].matchLength; U32 const ll0 = (litLength == 0); U32 const offBase = ZSTD_finalizeOffBase(inSeqs[seqNb].offset, updatedRepcodes.rep, ll0); DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ZSTD_storeSeqOnly(&cctx->seqStore, litLength, offBase, matchLength); ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); } } /* If we skipped repcode search while parsing, we need to update repcodes now */ if (!repcodeResolution && nbSequences > 1) { U32* const rep = updatedRepcodes.rep; if (nbSequences >= 4) { U32 lastSeqIdx = (U32)nbSequences - 2; /* index of last full sequence */ rep[2] = inSeqs[lastSeqIdx - 2].offset; rep[1] = inSeqs[lastSeqIdx - 1].offset; rep[0] = inSeqs[lastSeqIdx].offset; } else if (nbSequences == 3) { rep[2] = rep[0]; rep[1] = inSeqs[0].offset; rep[0] = inSeqs[1].offset; } else { assert(nbSequences == 2); rep[2] = rep[1]; rep[1] = rep[0]; rep[0] = inSeqs[0].offset; } } ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); return 0; } #if defined(ZSTD_ARCH_X86_AVX2) BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) { size_t i; __m256i const zeroVec = _mm256_setzero_si256(); __m256i sumVec = zeroVec; /* accumulates match+lit in 32-bit lanes */ ZSTD_ALIGNED(32) U32 tmp[8]; /* temporary buffer for reduction */ size_t mSum = 0, lSum = 0; ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); /* Process 2 structs (32 bytes) at a time */ for (i = 0; i + 2 <= nbSeqs; i += 2) { /* Load two consecutive ZSTD_Sequence (8×4 = 32 bytes) */ __m256i data = _mm256_loadu_si256((const __m256i*)(const void*)&seqs[i]); /* check end of block signal */ __m256i cmp = _mm256_cmpeq_epi32(data, zeroVec); int cmp_res = _mm256_movemask_epi8(cmp); /* indices for match lengths correspond to bits [8..11], [24..27] * => combined mask = 0x0F000F00 */ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); if (cmp_res & 0x0F000F00) break; /* Accumulate in sumVec */ sumVec = _mm256_add_epi32(sumVec, data); } /* Horizontal reduction */ _mm256_store_si256((__m256i*)tmp, sumVec); lSum = tmp[1] + tmp[5]; mSum = tmp[2] + tmp[6]; /* Handle the leftover */ for (; i < nbSeqs; i++) { lSum += seqs[i].litLength; mSum += seqs[i].matchLength; if (seqs[i].matchLength == 0) break; /* end of block */ } if (i==nbSeqs) { /* reaching end of sequences: end of block signal was not present */ BlockSummary bs; bs.nbSequences = ERROR(externalSequences_invalid); return bs; } { BlockSummary bs; bs.nbSequences = i+1; bs.blockSize = lSum + mSum; bs.litSize = lSum; return bs; } } #else BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) { size_t totalMatchSize = 0; size_t litSize = 0; size_t n; assert(seqs); for (n=0; n<nbSeqs; n++) { totalMatchSize += seqs[n].matchLength; litSize += seqs[n].litLength; if (seqs[n].matchLength == 0) { assert(seqs[n].offset == 0); break; } } if (n==nbSeqs) { BlockSummary bs; bs.nbSequences = ERROR(externalSequences_invalid); return bs; } { BlockSummary bs; bs.nbSequences = n+1; bs.blockSize = litSize + totalMatchSize; bs.litSize = litSize; return bs; } } #endif static size_t ZSTD_compressSequencesAndLiterals_internal(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const ZSTD_Sequence* inSeqs, size_t nbSequences, const void* literals, size_t litSize, size_t srcSize) { size_t remaining = srcSize; size_t cSize = 0; BYTE* op = (BYTE*)dst; int const repcodeResolution = (cctx->appliedParams.searchForExternalRepcodes == ZSTD_ps_enable); assert(cctx->appliedParams.searchForExternalRepcodes != ZSTD_ps_auto); DEBUGLOG(4, "ZSTD_compressSequencesAndLiterals_internal: nbSeqs=%zu, litSize=%zu", nbSequences, litSize); RETURN_ERROR_IF(nbSequences == 0, externalSequences_invalid, "Requires at least 1 end-of-block"); /* Special case: empty frame */ if ((nbSequences == 1) && (inSeqs[0].litLength == 0)) { U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1); RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "No room for empty frame block header"); MEM_writeLE24(op, cBlockHeader24); op += ZSTD_blockHeaderSize; dstCapacity -= ZSTD_blockHeaderSize; cSize += ZSTD_blockHeaderSize; } while (nbSequences) { size_t compressedSeqsSize, cBlockSize, conversionStatus; BlockSummary const block = ZSTD_get1BlockSummary(inSeqs, nbSequences); U32 const lastBlock = (block.nbSequences == nbSequences); FORWARD_IF_ERROR(block.nbSequences, "Error while trying to determine nb of sequences for a block"); assert(block.nbSequences <= nbSequences); RETURN_ERROR_IF(block.litSize > litSize, externalSequences_invalid, "discrepancy: Sequences require more literals than present in buffer"); ZSTD_resetSeqStore(&cctx->seqStore); conversionStatus = ZSTD_convertBlockSequences(cctx, inSeqs, block.nbSequences, repcodeResolution); FORWARD_IF_ERROR(conversionStatus, "Bad sequence conversion"); inSeqs += block.nbSequences; nbSequences -= block.nbSequences; remaining -= block.blockSize; /* Note: when blockSize is very small, other variant send it uncompressed. * Here, we still send the sequences, because we don't have the original source to send it uncompressed. * One could imagine in theory reproducing the source from the sequences, * but that's complex and costly memory intensive, and goes against the objectives of this variant. */ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); compressedSeqsSize = ZSTD_entropyCompressSeqStore_internal( op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, literals, block.litSize, &cctx->seqStore, &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, &cctx->appliedParams, cctx->tmpWorkspace, cctx->tmpWkspSize /* statically allocated in resetCCtx */, cctx->bmi2); FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); /* note: the spec forbids for any compressed block to be larger than maximum block size */ if (compressedSeqsSize > cctx->blockSizeMax) compressedSeqsSize = 0; DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); litSize -= block.litSize; literals = (const char*)literals + block.litSize; /* Note: difficult to check source for RLE block when only Literals are provided, * but it could be considered from analyzing the sequence directly */ if (compressedSeqsSize == 0) { /* Sending uncompressed blocks is out of reach, because the source is not provided. * In theory, one could use the sequences to regenerate the source, like a decompressor, * but it's complex, and memory hungry, killing the purpose of this variant. * Current outcome: generate an error code. */ RETURN_ERROR(cannotProduce_uncompressedBlock, "ZSTD_compressSequencesAndLiterals cannot generate an uncompressed block"); } else { U32 cBlockHeader; assert(compressedSeqsSize > 1); /* no RLE */ /* Error checking and repcodes update */ ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState); if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; /* Write block header into beginning of block*/ cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); MEM_writeLE24(op, cBlockHeader); cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); } cSize += cBlockSize; op += cBlockSize; dstCapacity -= cBlockSize; cctx->isFirstBlock = 0; DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); if (lastBlock) { assert(nbSequences == 0); break; } } RETURN_ERROR_IF(litSize != 0, externalSequences_invalid, "literals must be entirely and exactly consumed"); RETURN_ERROR_IF(remaining != 0, externalSequences_invalid, "Sequences must represent a total of exactly srcSize=%zu", srcSize); DEBUGLOG(4, "cSize final total: %zu", cSize); return cSize; } size_t ZSTD_compressSequencesAndLiterals(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const ZSTD_Sequence* inSeqs, size_t inSeqsSize, const void* literals, size_t litSize, size_t litCapacity, size_t decompressedSize) { BYTE* op = (BYTE*)dst; size_t cSize = 0; /* Transparent initialization stage, same as compressStream2() */ DEBUGLOG(4, "ZSTD_compressSequencesAndLiterals (dstCapacity=%zu)", dstCapacity); assert(cctx != NULL); if (litCapacity < litSize) { RETURN_ERROR(workSpace_tooSmall, "literals buffer is not large enough: must be at least 8 bytes larger than litSize (risk of read out-of-bound)"); } FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, decompressedSize), "CCtx initialization failed"); if (cctx->appliedParams.blockDelimiters == ZSTD_sf_noBlockDelimiters) { RETURN_ERROR(frameParameter_unsupported, "This mode is only compatible with explicit delimiters"); } if (cctx->appliedParams.validateSequences) { RETURN_ERROR(parameter_unsupported, "This mode is not compatible with Sequence validation"); } if (cctx->appliedParams.fParams.checksumFlag) { RETURN_ERROR(frameParameter_unsupported, "this mode is not compatible with frame checksum"); } /* Begin writing output, starting with frame header */ { size_t const frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, &cctx->appliedParams, decompressedSize, cctx->dictID); op += frameHeaderSize; assert(frameHeaderSize <= dstCapacity); dstCapacity -= frameHeaderSize; cSize += frameHeaderSize; } /* Now generate compressed blocks */ { size_t const cBlocksSize = ZSTD_compressSequencesAndLiterals_internal(cctx, op, dstCapacity, inSeqs, inSeqsSize, literals, litSize, decompressedSize); FORWARD_IF_ERROR(cBlocksSize, "Compressing blocks failed!"); cSize += cBlocksSize; assert(cBlocksSize <= dstCapacity); dstCapacity -= cBlocksSize; } DEBUGLOG(4, "Final compressed size: %zu", cSize); return cSize; } /*====== Finalize ======*/ static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) { const ZSTD_inBuffer nullInput = { NULL, 0, 0 }; const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); return stableInput ? zcs->expectedInBuffer : nullInput; } /*! ZSTD_flushStream() : * @return : amount of data remaining to flush */ size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) { ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); input.size = input.pos; /* do not ingest more input during flush */ return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); } size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) { ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ /* single thread mode : attempt to calculate remaining to flush more precisely */ { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; size_t const checksumSize = (size_t)(zcs->frameEnded ? 0 : zcs->appliedParams.fParams.checksumFlag * 4); size_t const toFlush = remainingToFlush + lastBlockSize + checksumSize; DEBUGLOG(4, "ZSTD_endStream : remaining to flush : %u", (unsigned)toFlush); return toFlush; } } /*-===== Pre-defined compression levels =====-*/ #include "clevels.h" int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; } int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; } int ZSTD_defaultCLevel(void) { return ZSTD_CLEVEL_DEFAULT; } static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(int const compressionLevel, size_t const dictSize) { ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, 0, dictSize, ZSTD_cpm_createCDict); switch (cParams.strategy) { case ZSTD_fast: case ZSTD_dfast: break; case ZSTD_greedy: case ZSTD_lazy: case ZSTD_lazy2: cParams.hashLog += ZSTD_LAZY_DDSS_BUCKET_LOG; break; case ZSTD_btlazy2: case ZSTD_btopt: case ZSTD_btultra: case ZSTD_btultra2: break; } return cParams; } static int ZSTD_dedicatedDictSearch_isSupported( ZSTD_compressionParameters const* cParams) { return (cParams->strategy >= ZSTD_greedy) && (cParams->strategy <= ZSTD_lazy2) && (cParams->hashLog > cParams->chainLog) && (cParams->chainLog <= 24); } /* * Reverses the adjustment applied to cparams when enabling dedicated dict * search. This is used to recover the params set to be used in the working * context. (Otherwise, those tables would also grow.) */ static void ZSTD_dedicatedDictSearch_revertCParams( ZSTD_compressionParameters* cParams) { switch (cParams->strategy) { case ZSTD_fast: case ZSTD_dfast: break; case ZSTD_greedy: case ZSTD_lazy: case ZSTD_lazy2: cParams->hashLog -= ZSTD_LAZY_DDSS_BUCKET_LOG; if (cParams->hashLog < ZSTD_HASHLOG_MIN) { cParams->hashLog = ZSTD_HASHLOG_MIN; } break; case ZSTD_btlazy2: case ZSTD_btopt: case ZSTD_btultra: case ZSTD_btultra2: break; } } static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) { switch (mode) { case ZSTD_cpm_unknown: case ZSTD_cpm_noAttachDict: case ZSTD_cpm_createCDict: break; case ZSTD_cpm_attachDict: dictSize = 0; break; default: assert(0); break; } { int const unknown = srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN; size_t const addedSize = unknown && dictSize > 0 ? 500 : 0; return unknown && dictSize == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : srcSizeHint+dictSize+addedSize; } } /*! ZSTD_getCParams_internal() : * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize. * Note: srcSizeHint 0 means 0, use ZSTD_CONTENTSIZE_UNKNOWN for unknown. * Use dictSize == 0 for unknown or unused. * Note: `mode` controls how we treat the `dictSize`. See docs for `ZSTD_CParamMode_e`. */ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) { U64 const rSize = ZSTD_getCParamRowSize(srcSizeHint, dictSize, mode); U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB); int row; DEBUGLOG(5, "ZSTD_getCParams_internal (cLevel=%i)", compressionLevel); /* row */ if (compressionLevel == 0) row = ZSTD_CLEVEL_DEFAULT; /* 0 == default */ else if (compressionLevel < 0) row = 0; /* entry 0 is baseline for fast mode */ else if (compressionLevel > ZSTD_MAX_CLEVEL) row = ZSTD_MAX_CLEVEL; else row = compressionLevel; { ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row]; DEBUGLOG(5, "ZSTD_getCParams_internal selected tableID: %u row: %u strat: %u", tableID, row, (U32)cp.strategy); /* acceleration factor */ if (compressionLevel < 0) { int const clampedCompressionLevel = MAX(ZSTD_minCLevel(), compressionLevel); cp.targetLength = (unsigned)(-clampedCompressionLevel); } /* refine parameters based on srcSize & dictSize */ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); } } /*! ZSTD_getCParams() : * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize. * Size values are optional, provide 0 if not known or unused */ ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) { if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; return ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); } /*! ZSTD_getParams() : * same idea as ZSTD_getCParams() * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). * Fields of `ZSTD_frameParameters` are set to default values */ static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) { ZSTD_parameters params; ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, mode); DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel); ZSTD_memset(¶ms, 0, sizeof(params)); params.cParams = cParams; params.fParams.contentSizeFlag = 1; return params; } /*! ZSTD_getParams() : * same idea as ZSTD_getCParams() * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). * Fields of `ZSTD_frameParameters` are set to default values */ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) { if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); } void ZSTD_registerSequenceProducer( ZSTD_CCtx* zc, void* extSeqProdState, ZSTD_sequenceProducer_F extSeqProdFunc) { assert(zc != NULL); ZSTD_CCtxParams_registerSequenceProducer( &zc->requestedParams, extSeqProdState, extSeqProdFunc ); } void ZSTD_CCtxParams_registerSequenceProducer( ZSTD_CCtx_params* params, void* extSeqProdState, ZSTD_sequenceProducer_F extSeqProdFunc) { assert(params != NULL); if (extSeqProdFunc != NULL) { params->extSeqProdFunc = extSeqProdFunc; params->extSeqProdState = extSeqProdState; } else { params->extSeqProdFunc = NULL; params->extSeqProdState = NULL; } } |
| 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 | // SPDX-License-Identifier: GPL-2.0 /* * thermal_helpers.c - helper functions to handle thermal devices * * Copyright (C) 2016 Eduardo Valentin <edubezval@gmail.com> * * Highly based on original thermal_core.c * Copyright (C) 2008 Intel Corp * Copyright (C) 2008 Zhang Rui <rui.zhang@intel.com> * Copyright (C) 2008 Sujith Thomas <sujith.thomas@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/device.h> #include <linux/err.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/sysfs.h> #include "thermal_core.h" #include "thermal_trace.h" int get_tz_trend(struct thermal_zone_device *tz, const struct thermal_trip *trip) { enum thermal_trend trend; if (tz->emul_temperature || !tz->ops.get_trend || tz->ops.get_trend(tz, trip, &trend)) { if (tz->temperature > tz->last_temperature) trend = THERMAL_TREND_RAISING; else if (tz->temperature < tz->last_temperature) trend = THERMAL_TREND_DROPPING; else trend = THERMAL_TREND_STABLE; } return trend; } static bool thermal_instance_present(struct thermal_zone_device *tz, struct thermal_cooling_device *cdev, const struct thermal_trip *trip) { const struct thermal_trip_desc *td = trip_to_trip_desc(trip); struct thermal_instance *ti; list_for_each_entry(ti, &td->thermal_instances, trip_node) { if (ti->cdev == cdev) return true; } return false; } bool thermal_trip_is_bound_to_cdev(struct thermal_zone_device *tz, const struct thermal_trip *trip, struct thermal_cooling_device *cdev) { guard(thermal_zone)(tz); guard(cooling_dev)(cdev); return thermal_instance_present(tz, cdev, trip); } EXPORT_SYMBOL_GPL(thermal_trip_is_bound_to_cdev); /** * __thermal_zone_get_temp() - returns the temperature of a thermal zone * @tz: a valid pointer to a struct thermal_zone_device * @temp: a valid pointer to where to store the resulting temperature. * * When a valid thermal zone reference is passed, it will fetch its * temperature and fill @temp. * * Both tz and tz->ops must be valid pointers when calling this function, * and the tz->ops.get_temp callback must be provided. * The function must be called under tz->lock. * * Return: On success returns 0, an error code otherwise */ int __thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp) { const struct thermal_trip_desc *td; int crit_temp = INT_MAX; int ret = -EINVAL; lockdep_assert_held(&tz->lock); ret = tz->ops.get_temp(tz, temp); if (IS_ENABLED(CONFIG_THERMAL_EMULATION) && tz->emul_temperature) { for_each_trip_desc(tz, td) { const struct thermal_trip *trip = &td->trip; if (trip->type == THERMAL_TRIP_CRITICAL) { crit_temp = trip->temperature; break; } } /* * Only allow emulating a temperature when the real temperature * is below the critical temperature so that the emulation code * cannot hide critical conditions. */ if (!ret && *temp < crit_temp) *temp = tz->emul_temperature; } if (ret) dev_dbg(&tz->device, "Failed to get temperature: %d\n", ret); return ret; } /** * thermal_zone_get_temp() - returns the temperature of a thermal zone * @tz: a valid pointer to a struct thermal_zone_device * @temp: a valid pointer to where to store the resulting temperature. * * When a valid thermal zone reference is passed, it will fetch its * temperature and fill @temp. * * Return: On success returns 0, an error code otherwise */ int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp) { int ret; if (IS_ERR_OR_NULL(tz)) return -EINVAL; guard(thermal_zone)(tz); if (!tz->ops.get_temp) return -EINVAL; ret = __thermal_zone_get_temp(tz, temp); if (!ret && *temp <= THERMAL_TEMP_INVALID) return -ENODATA; return ret; } EXPORT_SYMBOL_GPL(thermal_zone_get_temp); static int thermal_cdev_set_cur_state(struct thermal_cooling_device *cdev, int state) { int ret; /* * No check is needed for the ops->set_cur_state as the * registering function checked the ops are correctly set */ ret = cdev->ops->set_cur_state(cdev, state); if (ret) return ret; thermal_notify_cdev_state_update(cdev, state); thermal_cooling_device_stats_update(cdev, state); thermal_debug_cdev_state_update(cdev, state); return 0; } void __thermal_cdev_update(struct thermal_cooling_device *cdev) { struct thermal_instance *instance; unsigned long target = 0; /* Make sure cdev enters the deepest cooling state */ list_for_each_entry(instance, &cdev->thermal_instances, cdev_node) { if (instance->target == THERMAL_NO_TARGET) continue; if (instance->target > target) target = instance->target; } thermal_cdev_set_cur_state(cdev, target); trace_cdev_update(cdev, target); dev_dbg(&cdev->device, "set to state %lu\n", target); } /** * thermal_cdev_update - update cooling device state if needed * @cdev: pointer to struct thermal_cooling_device * * Update the cooling device state if there is a need. */ void thermal_cdev_update(struct thermal_cooling_device *cdev) { guard(cooling_dev)(cdev); if (!cdev->updated) { __thermal_cdev_update(cdev); cdev->updated = true; } } /** * thermal_cdev_update_nocheck() - Unconditionally update cooling device state * @cdev: Target cooling device. */ void thermal_cdev_update_nocheck(struct thermal_cooling_device *cdev) { guard(cooling_dev)(cdev); __thermal_cdev_update(cdev); } /** * thermal_zone_get_slope - return the slope attribute of the thermal zone * @tz: thermal zone device with the slope attribute * * Return: If the thermal zone device has a slope attribute, return it, else * return 1. */ int thermal_zone_get_slope(struct thermal_zone_device *tz) { if (tz && tz->tzp) return tz->tzp->slope; return 1; } EXPORT_SYMBOL_GPL(thermal_zone_get_slope); /** * thermal_zone_get_offset - return the offset attribute of the thermal zone * @tz: thermal zone device with the offset attribute * * Return: If the thermal zone device has a offset attribute, return it, else * return 0. */ int thermal_zone_get_offset(struct thermal_zone_device *tz) { if (tz && tz->tzp) return tz->tzp->offset; return 0; } EXPORT_SYMBOL_GPL(thermal_zone_get_offset); |
| 1128 464 467 465 4 676 73 435 466 1 3 8 1 3 7 1 1 70 19 7 24 8 4 4 4 256 257 257 255 2 257 30 745 17 776 779 778 894 897 896 461 463 463 462 463 462 301 778 781 779 777 95 732 145 671 762 4 4 4 4 4 4 4 4 4 237 240 240 36 240 766 768 768 764 765 3 758 3 762 763 765 4 757 760 566 3 141 669 755 1 700 65 760 756 759 4 2 6 752 13 2 1 14 13 1 12 12 10 2 6 4 3 4 4 779 778 3 779 2 1 3 470 471 471 468 471 470 470 471 7 7 2 7 9 236 243 292 245 55 12 12 10 271 272 67 245 293 1 20 1 96 98 1 96 4 9 6 3 2 7 1 21 2 51 8 26 5 20 1 10 1 30 1 29 1 27 2 1 9 22 22 3 19 1 9 11 21 11 11 11 11 11 21 10 85 11 96 2 57 40 98 80 6 6 6 6 471 583 586 3 471 470 472 472 216 219 219 193 14 672 672 670 671 670 375 374 373 41 3 25 13 9 9 1 1 3 4 1 1 1 1 1 1 30 30 1 1 1 23 4 1 21 1 1 1 17 6 1 2 2 1 1 16 1 1 8 1 3 9 2 7 4 26 1 26 26 2 2 1 1 1 1 5 2 1 1 1 1 1 1 1 1 26 19 2 2 1 1 1 26 26 26 26 26 1 8 3 21 26 21 6 1 1 1 3 3 3 1 2 2 1 1 31 25 5 16 27 4 27 26 26 15 25 26 511 3 1 12 3 502 2 510 508 4 2 2 2 2 8 1 2 16 8 8 8 8 8 8 7 8 10 10 10 1 8 4 2 1 4 1 5 1 2 19 1 18 19 5 2 17 27 2 26 28 11 18 19 19 3 19 17 10 8 18 19 1 4 4 1 1 1 8 8 1 1 4 2 1 17 17 1 4 1 1 1 1 1 1 1 13 13 13 13 13 13 8 8 8 5 7 2 4 3 6 10 1 1 1 5 1 8 4 3 12 10 13 13 13 13 13 13 2 8 5 13 12 501 500 502 501 500 697 699 673 656 675 698 401 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic address resolution entity * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * Fixes: * Vitaly E. Lavrov releasing NULL neighbor in neigh_add. * Harald Welte Add neighbour cache statistics like rtstat */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/socket.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/times.h> #include <net/net_namespace.h> #include <net/neighbour.h> #include <net/arp.h> #include <net/dst.h> #include <net/sock.h> #include <net/netevent.h> #include <net/netlink.h> #include <linux/rtnetlink.h> #include <linux/random.h> #include <linux/string.h> #include <linux/log2.h> #include <linux/inetdevice.h> #include <net/addrconf.h> #include <trace/events/neigh.h> #define NEIGH_DEBUG 1 #define neigh_dbg(level, fmt, ...) \ do { \ if (level <= NEIGH_DEBUG) \ pr_debug(fmt, ##__VA_ARGS__); \ } while (0) #define PNEIGH_HASHMASK 0xF static void neigh_timer_handler(struct timer_list *t); static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, struct net_device *dev); #ifdef CONFIG_PROC_FS static const struct seq_operations neigh_stat_seq_ops; #endif static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family) { int i; switch (family) { default: DEBUG_NET_WARN_ON_ONCE(1); fallthrough; /* to avoid panic by null-ptr-deref */ case AF_INET: i = NEIGH_ARP_TABLE; break; case AF_INET6: i = NEIGH_ND_TABLE; break; } return &dev->neighbours[i]; } /* Neighbour hash table buckets are protected with rwlock tbl->lock. - All the scans/updates to hash buckets MUST be made under this lock. - NOTHING clever should be made under this lock: no callbacks to protocol backends, no attempts to send something to network. It will result in deadlocks, if backend/driver wants to use neighbour cache. - If the entry requires some non-trivial actions, increase its reference count and release table lock. Neighbour entries are protected: - with reference count. - with rwlock neigh->lock Reference count prevents destruction. neigh->lock mainly serializes ll address data and its validity state. However, the same lock is used to protect another entry fields: - timer - resolution queue Again, nothing clever shall be made under neigh->lock, the most complicated procedure, which we allow is dev->hard_header. It is supposed, that dev->hard_header is simplistic and does not make callbacks to neighbour tables. */ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) { kfree_skb(skb); return -ENETDOWN; } static void neigh_cleanup_and_release(struct neighbour *neigh) { trace_neigh_cleanup_and_release(neigh, 0); __neigh_notify(neigh, RTM_DELNEIGH, 0, 0); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); neigh_release(neigh); } /* * It is random distribution in the interval (1/2)*base...(3/2)*base. * It corresponds to default IPv6 settings and is not overridable, * because it is really reasonable choice. */ unsigned long neigh_rand_reach_time(unsigned long base) { return base ? get_random_u32_below(base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); static void neigh_mark_dead(struct neighbour *n) { n->dead = 1; if (!list_empty(&n->gc_list)) { list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); } if (!list_empty(&n->managed_list)) list_del_init(&n->managed_list); } static void neigh_update_gc_list(struct neighbour *n) { bool on_gc_list, exempt_from_gc; write_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; /* remove from the gc list if new state is permanent or if neighbor * is externally learned; otherwise entry should be on the gc list */ exempt_from_gc = n->nud_state & NUD_PERMANENT || n->flags & NTF_EXT_LEARNED; on_gc_list = !list_empty(&n->gc_list); if (exempt_from_gc && on_gc_list) { list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); } else if (!exempt_from_gc && !on_gc_list) { /* add entries to the tail; cleaning removes from the front */ list_add_tail(&n->gc_list, &n->tbl->gc_list); atomic_inc(&n->tbl->gc_entries); } out: write_unlock(&n->lock); write_unlock_bh(&n->tbl->lock); } static void neigh_update_managed_list(struct neighbour *n) { bool on_managed_list, add_to_managed; write_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; add_to_managed = n->flags & NTF_MANAGED; on_managed_list = !list_empty(&n->managed_list); if (!add_to_managed && on_managed_list) list_del_init(&n->managed_list); else if (add_to_managed && !on_managed_list) list_add_tail(&n->managed_list, &n->tbl->managed_list); out: write_unlock(&n->lock); write_unlock_bh(&n->tbl->lock); } static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, bool *gc_update, bool *managed_update) { u32 ndm_flags, old_flags = neigh->flags; if (!(flags & NEIGH_UPDATE_F_ADMIN)) return; ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0; if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) { if (ndm_flags & NTF_EXT_LEARNED) neigh->flags |= NTF_EXT_LEARNED; else neigh->flags &= ~NTF_EXT_LEARNED; *notify = 1; *gc_update = true; } if ((old_flags ^ ndm_flags) & NTF_MANAGED) { if (ndm_flags & NTF_MANAGED) neigh->flags |= NTF_MANAGED; else neigh->flags &= ~NTF_MANAGED; *notify = 1; *managed_update = true; } } bool neigh_remove_one(struct neighbour *n) { bool retval = false; write_lock(&n->lock); if (refcount_read(&n->refcnt) == 1) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); retval = true; } write_unlock(&n->lock); if (retval) neigh_cleanup_and_release(n); return retval; } static int neigh_forced_gc(struct neigh_table *tbl) { int max_clean = atomic_read(&tbl->gc_entries) - READ_ONCE(tbl->gc_thresh2); u64 tmax = ktime_get_ns() + NSEC_PER_MSEC; unsigned long tref = jiffies - 5 * HZ; struct neighbour *n, *tmp; int shrunk = 0; int loop = 0; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); write_lock_bh(&tbl->lock); list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { if (refcount_read(&n->refcnt) == 1) { bool remove = false; write_lock(&n->lock); if ((n->nud_state == NUD_FAILED) || (n->nud_state == NUD_NOARP) || (tbl->is_multicast && tbl->is_multicast(n->primary_key)) || !time_in_range(n->updated, tref, jiffies)) remove = true; write_unlock(&n->lock); if (remove && neigh_remove_one(n)) shrunk++; if (shrunk >= max_clean) break; if (++loop == 16) { if (ktime_get_ns() > tmax) goto unlock; loop = 0; } } } WRITE_ONCE(tbl->last_flush, jiffies); unlock: write_unlock_bh(&tbl->lock); return shrunk; } static void neigh_add_timer(struct neighbour *n, unsigned long when) { /* Use safe distance from the jiffies - LONG_MAX point while timer * is running in DELAY/PROBE state but still show to user space * large times in the past. */ unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ); neigh_hold(n); if (!time_in_range(n->confirmed, mint, jiffies)) n->confirmed = mint; if (time_before(n->used, n->confirmed)) n->used = n->confirmed; if (unlikely(mod_timer(&n->timer, when))) { printk("NEIGH: BUG, double timer add, state is %x\n", n->nud_state); dump_stack(); } } static int neigh_del_timer(struct neighbour *n) { if ((n->nud_state & NUD_IN_TIMER) && timer_delete(&n->timer)) { neigh_release(n); return 1; } return 0; } static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, int family) { switch (family) { case AF_INET: return __in_dev_arp_parms_get_rcu(dev); case AF_INET6: return __in6_dev_nd_parms_get_rcu(dev); } return NULL; } static void neigh_parms_qlen_dec(struct net_device *dev, int family) { struct neigh_parms *p; rcu_read_lock(); p = neigh_get_dev_parms_rcu(dev, family); if (p) p->qlen--; rcu_read_unlock(); } static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, int family) { struct sk_buff_head tmp; unsigned long flags; struct sk_buff *skb; skb_queue_head_init(&tmp); spin_lock_irqsave(&list->lock, flags); skb = skb_peek(list); while (skb != NULL) { struct sk_buff *skb_next = skb_peek_next(skb, list); struct net_device *dev = skb->dev; if (net == NULL || net_eq(dev_net(dev), net)) { neigh_parms_qlen_dec(dev, family); __skb_unlink(skb, list); __skb_queue_tail(&tmp, skb); } skb = skb_next; } spin_unlock_irqrestore(&list->lock, flags); while ((skb = __skb_dequeue(&tmp))) { dev_put(skb->dev); kfree_skb(skb); } } static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { struct hlist_head *dev_head; struct hlist_node *tmp; struct neighbour *n; dev_head = neigh_get_dev_table(dev, tbl->family); hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) { if (skip_perm && n->nud_state & NUD_PERMANENT) continue; hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); write_lock(&n->lock); neigh_del_timer(n); neigh_mark_dead(n); if (refcount_read(&n->refcnt) != 1) { /* The most unpleasant situation. * We must destroy neighbour entry, * but someone still uses it. * * The destroy will be delayed until * the last user releases us, but * we must kill timers etc. and move * it to safe state. */ __skb_queue_purge(&n->arp_queue); n->arp_queue_len_bytes = 0; WRITE_ONCE(n->output, neigh_blackhole); if (n->nud_state & NUD_VALID) n->nud_state = NUD_NOARP; else n->nud_state = NUD_NONE; neigh_dbg(2, "neigh %p is stray\n", n); } write_unlock(&n->lock); neigh_cleanup_and_release(n); } } void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) { write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, false); write_unlock_bh(&tbl->lock); } EXPORT_SYMBOL(neigh_changeaddr); static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, skip_perm); pneigh_ifdown_and_unlock(tbl, dev); pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) timer_delete_sync(&tbl->proxy_timer); return 0; } int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev) { __neigh_ifdown(tbl, dev, true); return 0; } EXPORT_SYMBOL(neigh_carrier_down); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) { __neigh_ifdown(tbl, dev, false); return 0; } EXPORT_SYMBOL(neigh_ifdown); static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev, u32 flags, bool exempt_from_gc) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries, gc_thresh3; if (exempt_from_gc) goto do_alloc; entries = atomic_inc_return(&tbl->gc_entries) - 1; gc_thresh3 = READ_ONCE(tbl->gc_thresh3); if (entries >= gc_thresh3 || (entries >= READ_ONCE(tbl->gc_thresh2) && time_after(now, READ_ONCE(tbl->last_flush) + 5 * HZ))) { if (!neigh_forced_gc(tbl) && entries >= gc_thresh3) { net_info_ratelimited("%s: neighbor table overflow!\n", tbl->id); NEIGH_CACHE_STAT_INC(tbl, table_fulls); goto out_entries; } } do_alloc: n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; __skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); seqlock_init(&n->ha_lock); n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; n->flags = flags; seqlock_init(&n->hh.hh_lock); n->parms = neigh_parms_clone(&tbl->parms); timer_setup(&n->timer, neigh_timer_handler, 0); NEIGH_CACHE_STAT_INC(tbl, allocs); n->tbl = tbl; refcount_set(&n->refcnt, 1); n->dead = 1; INIT_LIST_HEAD(&n->gc_list); INIT_LIST_HEAD(&n->managed_list); atomic_inc(&tbl->entries); out: return n; out_entries: if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); goto out; } static void neigh_get_hash_rnd(u32 *x) { *x = get_random_u32() | 1; } static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) { size_t size = (1 << shift) * sizeof(struct hlist_head); struct hlist_head *hash_heads; struct neigh_hash_table *ret; int i; ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) return NULL; hash_heads = kzalloc(size, GFP_ATOMIC); if (!hash_heads) { kfree(ret); return NULL; } ret->hash_heads = hash_heads; ret->hash_shift = shift; for (i = 0; i < NEIGH_NUM_HASH_RND; i++) neigh_get_hash_rnd(&ret->hash_rnd[i]); return ret; } static void neigh_hash_free_rcu(struct rcu_head *head) { struct neigh_hash_table *nht = container_of(head, struct neigh_hash_table, rcu); kfree(nht->hash_heads); kfree(nht); } static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, unsigned long new_shift) { unsigned int i, hash; struct neigh_hash_table *new_nht, *old_nht; NEIGH_CACHE_STAT_INC(tbl, hash_grows); old_nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); new_nht = neigh_hash_alloc(new_shift); if (!new_nht) return old_nht; for (i = 0; i < (1 << old_nht->hash_shift); i++) { struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) { hash = tbl->hash(n->primary_key, n->dev, new_nht->hash_rnd); hash >>= (32 - new_nht->hash_shift); hlist_del_rcu(&n->hash); hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]); } } rcu_assign_pointer(tbl->nht, new_nht); call_rcu(&old_nht->rcu, neigh_hash_free_rcu); return new_nht; } struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n; NEIGH_CACHE_STAT_INC(tbl, lookups); rcu_read_lock(); n = __neigh_lookup_noref(tbl, pkey, dev); if (n) { if (!refcount_inc_not_zero(&n->refcnt)) n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); } rcu_read_unlock(); return n; } EXPORT_SYMBOL(neigh_lookup); static struct neighbour * ___neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, u32 flags, bool exempt_from_gc, bool want_ref) { u32 hash_val, key_len = tbl->key_len; struct neighbour *n1, *rc, *n; struct neigh_hash_table *nht; int error; n = neigh_alloc(tbl, dev, flags, exempt_from_gc); trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc); if (!n) { rc = ERR_PTR(-ENOBUFS); goto out; } memcpy(n->primary_key, pkey, key_len); n->dev = dev; netdev_hold(dev, &n->dev_tracker, GFP_ATOMIC); /* Protocol specific setup. */ if (tbl->constructor && (error = tbl->constructor(n)) < 0) { rc = ERR_PTR(error); goto out_neigh_release; } if (dev->netdev_ops->ndo_neigh_construct) { error = dev->netdev_ops->ndo_neigh_construct(dev, n); if (error < 0) { rc = ERR_PTR(error); goto out_neigh_release; } } /* Device specific setup. */ if (n->parms->neigh_setup && (error = n->parms->neigh_setup(n)) < 0) { rc = ERR_PTR(error); goto out_neigh_release; } n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); write_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) nht = neigh_hash_grow(tbl, nht->hash_shift + 1); hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift); if (n->parms->dead) { rc = ERR_PTR(-EINVAL); goto out_tbl_unlock; } neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) { if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); rc = n1; goto out_tbl_unlock; } } n->dead = 0; if (!exempt_from_gc) list_add_tail(&n->gc_list, &n->tbl->gc_list); if (n->flags & NTF_MANAGED) list_add_tail(&n->managed_list, &n->tbl->managed_list); if (want_ref) neigh_hold(n); hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]); hlist_add_head_rcu(&n->dev_list, neigh_get_dev_table(dev, tbl->family)); write_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; out: return rc; out_tbl_unlock: write_unlock_bh(&tbl->lock); out_neigh_release: if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); neigh_release(n); goto out; } struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK); return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref); } EXPORT_SYMBOL(__neigh_create); static u32 pneigh_hash(const void *pkey, unsigned int key_len) { u32 hash_val = *(u32 *)(pkey + key_len - 4); hash_val ^= (hash_val >> 16); hash_val ^= hash_val >> 8; hash_val ^= hash_val >> 4; hash_val &= PNEIGH_HASHMASK; return hash_val; } static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, struct net *net, const void *pkey, unsigned int key_len, struct net_device *dev) { while (n) { if (!memcmp(n->key, pkey, key_len) && net_eq(pneigh_net(n), net) && (n->dev == dev || !n->dev)) return n; n = n->next; } return NULL; } struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); return __pneigh_lookup_1(tbl->phash_buckets[hash_val], net, pkey, key_len, dev); } EXPORT_SYMBOL_GPL(__pneigh_lookup); struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev, int creat) { struct pneigh_entry *n; unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); read_lock_bh(&tbl->lock); n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], net, pkey, key_len, dev); read_unlock_bh(&tbl->lock); if (n || !creat) goto out; ASSERT_RTNL(); n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); if (!n) goto out; write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; netdev_hold(dev, &n->dev_tracker, GFP_KERNEL); if (tbl->pconstructor && tbl->pconstructor(n)) { netdev_put(dev, &n->dev_tracker); kfree(n); n = NULL; goto out; } write_lock_bh(&tbl->lock); n->next = tbl->phash_buckets[hash_val]; tbl->phash_buckets[hash_val] = n; write_unlock_bh(&tbl->lock); out: return n; } EXPORT_SYMBOL(pneigh_lookup); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { struct pneigh_entry *n, **np; unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); write_lock_bh(&tbl->lock); for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { *np = n->next; write_unlock_bh(&tbl->lock); if (tbl->pdestructor) tbl->pdestructor(n); netdev_put(n->dev, &n->dev_tracker); kfree(n); return 0; } } write_unlock_bh(&tbl->lock); return -ENOENT; } static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, struct net_device *dev) { struct pneigh_entry *n, **np, *freelist = NULL; u32 h; for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; while ((n = *np) != NULL) { if (!dev || n->dev == dev) { *np = n->next; n->next = freelist; freelist = n; continue; } np = &n->next; } } write_unlock_bh(&tbl->lock); while ((n = freelist)) { freelist = n->next; n->next = NULL; if (tbl->pdestructor) tbl->pdestructor(n); netdev_put(n->dev, &n->dev_tracker); kfree(n); } return -ENOENT; } static inline void neigh_parms_put(struct neigh_parms *parms) { if (refcount_dec_and_test(&parms->refcnt)) kfree(parms); } /* * neighbour must already be out of the table; * */ void neigh_destroy(struct neighbour *neigh) { struct net_device *dev = neigh->dev; NEIGH_CACHE_STAT_INC(neigh->tbl, destroys); if (!neigh->dead) { pr_warn("Destroying alive neighbour %p\n", neigh); dump_stack(); return; } if (neigh_del_timer(neigh)) pr_warn("Impossible event\n"); write_lock_bh(&neigh->lock); __skb_queue_purge(&neigh->arp_queue); write_unlock_bh(&neigh->lock); neigh->arp_queue_len_bytes = 0; if (dev->netdev_ops->ndo_neigh_destroy) dev->netdev_ops->ndo_neigh_destroy(dev, neigh); netdev_put(dev, &neigh->dev_tracker); neigh_parms_put(neigh->parms); neigh_dbg(2, "neigh %p is destroyed\n", neigh); atomic_dec(&neigh->tbl->entries); kfree_rcu(neigh, rcu); } EXPORT_SYMBOL(neigh_destroy); /* Neighbour state is suspicious; disable fast path. Called with write_locked neigh. */ static void neigh_suspect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is suspected\n", neigh); WRITE_ONCE(neigh->output, neigh->ops->output); } /* Neighbour state is OK; enable fast path. Called with write_locked neigh. */ static void neigh_connect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is connected\n", neigh); WRITE_ONCE(neigh->output, neigh->ops->connected_output); } static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); struct neigh_hash_table *nht; struct hlist_node *tmp; struct neighbour *n; unsigned int i; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); write_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); /* * periodically recompute ReachableTime from random function */ if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { struct neigh_parms *p; WRITE_ONCE(tbl->last_rand, jiffies); list_for_each_entry(p, &tbl->parms_list, list) p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); } if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1)) goto out; for (i = 0 ; i < (1 << nht->hash_shift); i++) { neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { unsigned int state; write_lock(&n->lock); state = n->nud_state; if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || (n->flags & NTF_EXT_LEARNED)) { write_unlock(&n->lock); continue; } if (time_before(n->used, n->confirmed) && time_is_before_eq_jiffies(n->confirmed)) n->used = n->confirmed; if (refcount_read(&n->refcnt) == 1 && (state == NUD_FAILED || !time_in_range_open(jiffies, n->used, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; } write_unlock(&n->lock); } /* * It's fine to release lock here, even if hash table * grows while we are preempted. */ write_unlock_bh(&tbl->lock); cond_resched(); write_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); } out: /* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks. * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2 * BASE_REACHABLE_TIME. */ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1); write_unlock_bh(&tbl->lock); } static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; return NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) + (n->nud_state & NUD_PROBE ? NEIGH_VAR(p, MCAST_REPROBES) : NEIGH_VAR(p, MCAST_PROBES)); } static void neigh_invalidate(struct neighbour *neigh) __releases(neigh->lock) __acquires(neigh->lock) { struct sk_buff *skb; NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); neigh_dbg(2, "neigh %p is failed\n", neigh); neigh->updated = jiffies; /* It is very thin place. report_unreachable is very complicated routine. Particularly, it can hit the same neighbour entry! So that, we try to be accurate and avoid dead loop. --ANK */ while (neigh->nud_state == NUD_FAILED && (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { write_unlock(&neigh->lock); neigh->ops->error_report(neigh, skb); write_lock(&neigh->lock); } __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } static void neigh_probe(struct neighbour *neigh) __releases(neigh->lock) { struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); /* keep skb alive even if arp_queue overflows */ if (skb) skb = skb_clone(skb, GFP_ATOMIC); write_unlock(&neigh->lock); if (neigh->ops->solicit) neigh->ops->solicit(neigh, skb); atomic_inc(&neigh->probes); consume_skb(skb); } /* Called when a timer expires for a neighbour entry. */ static void neigh_timer_handler(struct timer_list *t) { unsigned long now, next; struct neighbour *neigh = from_timer(neigh, t, timer); unsigned int state; int notify = 0; write_lock(&neigh->lock); state = neigh->nud_state; now = jiffies; next = now + HZ; if (!(state & NUD_IN_TIMER)) goto out; if (state & NUD_REACHABLE) { if (time_before_eq(now, neigh->confirmed + neigh->parms->reachable_time)) { neigh_dbg(2, "neigh %p is still alive\n", neigh); next = neigh->confirmed + neigh->parms->reachable_time; } else if (time_before_eq(now, neigh->used + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is delayed\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_DELAY); neigh->updated = jiffies; neigh_suspect(neigh); next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME); } else { neigh_dbg(2, "neigh %p is suspected\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_STALE); neigh->updated = jiffies; neigh_suspect(neigh); notify = 1; } } else if (state & NUD_DELAY) { if (time_before_eq(now, neigh->confirmed + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is now reachable\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_REACHABLE); neigh->updated = jiffies; neigh_connect(neigh); notify = 1; next = neigh->confirmed + neigh->parms->reachable_time; } else { neigh_dbg(2, "neigh %p is probed\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_PROBE); neigh->updated = jiffies; atomic_set(&neigh->probes, 0); notify = 1; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } } else { /* NUD_PROBE|NUD_INCOMPLETE */ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { WRITE_ONCE(neigh->nud_state, NUD_FAILED); notify = 1; neigh_invalidate(neigh); goto out; } if (neigh->nud_state & NUD_IN_TIMER) { if (time_before(next, jiffies + HZ/100)) next = jiffies + HZ/100; if (!mod_timer(&neigh->timer, next)) neigh_hold(neigh); } if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { neigh_probe(neigh); } else { out: write_unlock(&neigh->lock); } if (notify) neigh_update_notify(neigh, 0); trace_neigh_timer_handler(neigh, 0); neigh_release(neigh); } int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok) { int rc; bool immediate_probe = false; write_lock_bh(&neigh->lock); rc = 0; if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) goto out_unlock_bh; if (neigh->dead) goto out_dead; if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + NEIGH_VAR(neigh->parms, APP_PROBES)) { unsigned long next, now = jiffies; atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh_del_timer(neigh); WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); neigh->updated = now; if (!immediate_ok) { next = now + 1; } else { immediate_probe = true; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ / 100); } neigh_add_timer(neigh, next); } else { WRITE_ONCE(neigh->nud_state, NUD_FAILED); neigh->updated = jiffies; write_unlock_bh(&neigh->lock); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); return 1; } } else if (neigh->nud_state & NUD_STALE) { neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh_del_timer(neigh); WRITE_ONCE(neigh->nud_state, NUD_DELAY); neigh->updated = jiffies; neigh_add_timer(neigh, jiffies + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME)); } if (neigh->nud_state == NUD_INCOMPLETE) { if (skb) { while (neigh->arp_queue_len_bytes + skb->truesize > NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) { struct sk_buff *buff; buff = __skb_dequeue(&neigh->arp_queue); if (!buff) break; neigh->arp_queue_len_bytes -= buff->truesize; kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } skb_dst_force(skb); __skb_queue_tail(&neigh->arp_queue, skb); neigh->arp_queue_len_bytes += skb->truesize; } rc = 1; } out_unlock_bh: if (immediate_probe) neigh_probe(neigh); else write_unlock(&neigh->lock); local_bh_enable(); trace_neigh_event_send_done(neigh, rc); return rc; out_dead: if (neigh->nud_state & NUD_STALE) goto out_unlock_bh; write_unlock_bh(&neigh->lock); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD); trace_neigh_event_send_dead(neigh, 1); return 1; } EXPORT_SYMBOL(__neigh_event_send); static void neigh_update_hhs(struct neighbour *neigh) { struct hh_cache *hh; void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) = NULL; if (neigh->dev->header_ops) update = neigh->dev->header_ops->cache_update; if (update) { hh = &neigh->hh; if (READ_ONCE(hh->hh_len)) { write_seqlock_bh(&hh->hh_lock); update(hh, neigh->dev, neigh->ha); write_sequnlock_bh(&hh->hh_lock); } } } /* Generic update routine. -- lladdr is new lladdr or NULL, if it is not supplied. -- new is new state. -- flags NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr, if it is different. NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" lladdr instead of overriding it if it is different. NEIGH_UPDATE_F_ADMIN means that the change is administrative. NEIGH_UPDATE_F_USE means that the entry is user triggered. NEIGH_UPDATE_F_MANAGED means that the entry will be auto-refreshed. NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as a router. Caller MUST hold reference count on the entry. */ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid, struct netlink_ext_ack *extack) { bool gc_update = false, managed_update = false; int update_isrouter = 0; struct net_device *dev; int err, notify = 0; u8 old; trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid); write_lock_bh(&neigh->lock); dev = neigh->dev; old = neigh->nud_state; err = -EPERM; if (neigh->dead) { NL_SET_ERR_MSG(extack, "Neighbor entry is now dead"); new = old; goto out; } if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; neigh_update_flags(neigh, flags, ¬ify, &gc_update, &managed_update); if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) { new = old & ~NUD_PERMANENT; WRITE_ONCE(neigh->nud_state, new); err = 0; goto out; } if (!(new & NUD_VALID)) { neigh_del_timer(neigh); if (old & NUD_CONNECTED) neigh_suspect(neigh); WRITE_ONCE(neigh->nud_state, new); err = 0; notify = old & NUD_VALID; if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && (new & NUD_FAILED)) { neigh_invalidate(neigh); notify = 1; } goto out; } /* Compare new lladdr with cached one */ if (!dev->addr_len) { /* First case: device needs no address. */ lladdr = neigh->ha; } else if (lladdr) { /* The second case: if something is already cached and a new address is proposed: - compare new & old - if they are different, check override flag */ if ((old & NUD_VALID) && !memcmp(lladdr, neigh->ha, dev->addr_len)) lladdr = neigh->ha; } else { /* No address is supplied; if we know something, use it, otherwise discard the request. */ err = -EINVAL; if (!(old & NUD_VALID)) { NL_SET_ERR_MSG(extack, "No link layer address given"); goto out; } lladdr = neigh->ha; } /* Update confirmed timestamp for neighbour entry after we * received ARP packet even if it doesn't change IP to MAC binding. */ if (new & NUD_CONNECTED) neigh->confirmed = jiffies; /* If entry was valid and address is not changed, do not change entry state, if new one is STALE. */ err = 0; update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER; if (old & NUD_VALID) { if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) { update_isrouter = 0; if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) && (old & NUD_CONNECTED)) { lladdr = neigh->ha; new = NUD_STALE; } else goto out; } else { if (lladdr == neigh->ha && new == NUD_STALE && !(flags & NEIGH_UPDATE_F_ADMIN)) new = old; } } /* Update timestamp only once we know we will make a change to the * neighbour entry. Otherwise we risk to move the locktime window with * noop updates and ignore relevant ARP updates. */ if (new != old || lladdr != neigh->ha) neigh->updated = jiffies; if (new != old) { neigh_del_timer(neigh); if (new & NUD_PROBE) atomic_set(&neigh->probes, 0); if (new & NUD_IN_TIMER) neigh_add_timer(neigh, (jiffies + ((new & NUD_REACHABLE) ? neigh->parms->reachable_time : 0))); WRITE_ONCE(neigh->nud_state, new); notify = 1; } if (lladdr != neigh->ha) { write_seqlock(&neigh->ha_lock); memcpy(&neigh->ha, lladdr, dev->addr_len); write_sequnlock(&neigh->ha_lock); neigh_update_hhs(neigh); if (!(new & NUD_CONNECTED)) neigh->confirmed = jiffies - (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1); notify = 1; } if (new == old) goto out; if (new & NUD_CONNECTED) neigh_connect(neigh); else neigh_suspect(neigh); if (!(old & NUD_VALID)) { struct sk_buff *skb; /* Again: avoid dead loop if something went wrong */ while (neigh->nud_state & NUD_VALID && (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { struct dst_entry *dst = skb_dst(skb); struct neighbour *n2, *n1 = neigh; write_unlock_bh(&neigh->lock); rcu_read_lock(); /* Why not just use 'neigh' as-is? The problem is that * things such as shaper, eql, and sch_teql can end up * using alternative, different, neigh objects to output * the packet in the output path. So what we need to do * here is re-lookup the top-level neigh in the path so * we can reinject the packet there. */ n2 = NULL; if (dst && dst->obsolete != DST_OBSOLETE_DEAD) { n2 = dst_neigh_lookup_skb(dst, skb); if (n2) n1 = n2; } READ_ONCE(n1->output)(n1, skb); if (n2) neigh_release(n2); rcu_read_unlock(); write_lock_bh(&neigh->lock); } __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } out: if (update_isrouter) neigh_update_is_router(neigh, flags, ¬ify); write_unlock_bh(&neigh->lock); if (((new ^ old) & NUD_PERMANENT) || gc_update) neigh_update_gc_list(neigh); if (managed_update) neigh_update_managed_list(neigh); if (notify) neigh_update_notify(neigh, nlmsg_pid); trace_neigh_update_done(neigh, err); return err; } int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid) { return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL); } EXPORT_SYMBOL(neigh_update); /* Update the neigh to listen temporarily for probe responses, even if it is * in a NUD_FAILED state. The caller has to hold neigh->lock for writing. */ void __neigh_set_probe_once(struct neighbour *neigh) { if (neigh->dead) return; neigh->updated = jiffies; if (!(neigh->nud_state & NUD_FAILED)) return; WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); atomic_set(&neigh->probes, neigh_max_probes(neigh)); neigh_add_timer(neigh, jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100)); } EXPORT_SYMBOL(__neigh_set_probe_once); struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev) { struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev, lladdr || !dev->addr_len); if (neigh) neigh_update(neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_OVERRIDE, 0); return neigh; } EXPORT_SYMBOL(neigh_event_ns); /* called with read_lock_bh(&n->lock); */ static void neigh_hh_init(struct neighbour *n) { struct net_device *dev = n->dev; __be16 prot = n->tbl->protocol; struct hh_cache *hh = &n->hh; write_lock_bh(&n->lock); /* Only one thread can come in here and initialize the * hh_cache entry. */ if (!hh->hh_len) dev->header_ops->cache(n, hh, prot); write_unlock_bh(&n->lock); } /* Slow and careful. */ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) { int rc = 0; if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; unsigned int seq; if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len)) neigh_hh_init(neigh); do { __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) rc = dev_queue_xmit(skb); else goto out_kfree_skb; } out: return rc; out_kfree_skb: rc = -EINVAL; kfree_skb(skb); goto out; } EXPORT_SYMBOL(neigh_resolve_output); /* As fast as possible without hh cache */ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) { struct net_device *dev = neigh->dev; unsigned int seq; int err; do { __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) err = dev_queue_xmit(skb); else { err = -EINVAL; kfree_skb(skb); } return err; } EXPORT_SYMBOL(neigh_connected_output); int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb) { return dev_queue_xmit(skb); } EXPORT_SYMBOL(neigh_direct_output); static void neigh_managed_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, managed_work.work); struct neighbour *neigh; write_lock_bh(&tbl->lock); list_for_each_entry(neigh, &tbl->managed_list, managed_list) neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS)); write_unlock_bh(&tbl->lock); } static void neigh_proxy_process(struct timer_list *t) { struct neigh_table *tbl = from_timer(tbl, t, proxy_timer); long sched_next = 0; unsigned long now = jiffies; struct sk_buff *skb, *n; spin_lock(&tbl->proxy_queue.lock); skb_queue_walk_safe(&tbl->proxy_queue, skb, n) { long tdif = NEIGH_CB(skb)->sched_next - now; if (tdif <= 0) { struct net_device *dev = skb->dev; neigh_parms_qlen_dec(dev, tbl->family); __skb_unlink(skb, &tbl->proxy_queue); if (tbl->proxy_redo && netif_running(dev)) { rcu_read_lock(); tbl->proxy_redo(skb); rcu_read_unlock(); } else { kfree_skb(skb); } dev_put(dev); } else if (!sched_next || tdif < sched_next) sched_next = tdif; } timer_delete(&tbl->proxy_timer); if (sched_next) mod_timer(&tbl->proxy_timer, jiffies + sched_next); spin_unlock(&tbl->proxy_queue.lock); } static unsigned long neigh_proxy_delay(struct neigh_parms *p) { /* If proxy_delay is zero, do not call get_random_u32_below() * as it is undefined behavior. */ unsigned long proxy_delay = NEIGH_VAR(p, PROXY_DELAY); return proxy_delay ? jiffies + get_random_u32_below(proxy_delay) : jiffies; } void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { unsigned long sched_next = neigh_proxy_delay(p); if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); return; } NEIGH_CB(skb)->sched_next = sched_next; NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; spin_lock(&tbl->proxy_queue.lock); if (timer_delete(&tbl->proxy_timer)) { if (time_before(tbl->proxy_timer.expires, sched_next)) sched_next = tbl->proxy_timer.expires; } skb_dst_drop(skb); dev_hold(skb->dev); __skb_queue_tail(&tbl->proxy_queue, skb); p->qlen++; mod_timer(&tbl->proxy_timer, sched_next); spin_unlock(&tbl->proxy_queue.lock); } EXPORT_SYMBOL(pneigh_enqueue); static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, struct net *net, int ifindex) { struct neigh_parms *p; list_for_each_entry(p, &tbl->parms_list, list) { if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || (!p->dev && !ifindex && net_eq(net, &init_net))) return p; } return NULL; } struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl) { struct neigh_parms *p; struct net *net = dev_net(dev); const struct net_device_ops *ops = dev->netdev_ops; p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL); if (p) { p->tbl = tbl; refcount_set(&p->refcnt, 1); p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); p->qlen = 0; netdev_hold(dev, &p->dev_tracker, GFP_KERNEL); p->dev = dev; write_pnet(&p->net, net); p->sysctl_table = NULL; if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { netdev_put(dev, &p->dev_tracker); kfree(p); return NULL; } write_lock_bh(&tbl->lock); list_add(&p->list, &tbl->parms.list); write_unlock_bh(&tbl->lock); neigh_parms_data_state_cleanall(p); } return p; } EXPORT_SYMBOL(neigh_parms_alloc); static void neigh_rcu_free_parms(struct rcu_head *head) { struct neigh_parms *parms = container_of(head, struct neigh_parms, rcu_head); neigh_parms_put(parms); } void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) { if (!parms || parms == &tbl->parms) return; write_lock_bh(&tbl->lock); list_del(&parms->list); parms->dead = 1; write_unlock_bh(&tbl->lock); netdev_put(parms->dev, &parms->dev_tracker); call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } EXPORT_SYMBOL(neigh_parms_release); static struct lock_class_key neigh_table_proxy_queue_class; static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly; void neigh_table_init(int index, struct neigh_table *tbl) { unsigned long now = jiffies; unsigned long phsize; INIT_LIST_HEAD(&tbl->parms_list); INIT_LIST_HEAD(&tbl->gc_list); INIT_LIST_HEAD(&tbl->managed_list); list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); tbl->parms.reachable_time = neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME)); tbl->parms.qlen = 0; tbl->stats = alloc_percpu(struct neigh_statistics); if (!tbl->stats) panic("cannot create neighbour cache statistics"); #ifdef CONFIG_PROC_FS if (!proc_create_seq_data(tbl->id, 0, init_net.proc_net_stat, &neigh_stat_seq_ops, tbl)) panic("cannot create neighbour proc dir entry"); #endif RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3)); phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); if (!tbl->nht || !tbl->phash_buckets) panic("cannot allocate neighbour cache hashes"); if (!tbl->entry_size) tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) + tbl->key_len, NEIGH_PRIV_ALIGN); else WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); rwlock_init(&tbl->lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, tbl->parms.reachable_time); INIT_DEFERRABLE_WORK(&tbl->managed_work, neigh_managed_work); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, 0); timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0); skb_queue_head_init_class(&tbl->proxy_queue, &neigh_table_proxy_queue_class); tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; rcu_assign_pointer(neigh_tables[index], tbl); } EXPORT_SYMBOL(neigh_table_init); /* * Only called from ndisc_cleanup(), which means this is dead code * because we no longer can unload IPv6 module. */ int neigh_table_clear(int index, struct neigh_table *tbl) { RCU_INIT_POINTER(neigh_tables[index], NULL); synchronize_rcu(); /* It is not clean... Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); timer_delete_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, neigh_hash_free_rcu); tbl->nht = NULL; kfree(tbl->phash_buckets); tbl->phash_buckets = NULL; remove_proc_entry(tbl->id, init_net.proc_net_stat); free_percpu(tbl->stats); tbl->stats = NULL; return 0; } EXPORT_SYMBOL(neigh_table_clear); static struct neigh_table *neigh_find_table(int family) { struct neigh_table *tbl = NULL; switch (family) { case AF_INET: tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]); break; case AF_INET6: tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]); break; } return tbl; } const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID }, [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, [NDA_PROBES] = { .type = NLA_U32 }, [NDA_VLAN] = { .type = NLA_U16 }, [NDA_PORT] = { .type = NLA_U16 }, [NDA_VNI] = { .type = NLA_U32 }, [NDA_IFINDEX] = { .type = NLA_U32 }, [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, [NDA_NH_ID] = { .type = NLA_U32 }, [NDA_FLAGS_EXT] = NLA_POLICY_MASK(NLA_U32, NTF_EXT_MASK), [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *dst_attr; struct neigh_table *tbl; struct neighbour *neigh; struct net_device *dev = NULL; int err = -EINVAL; ASSERT_RTNL(); if (nlmsg_len(nlh) < sizeof(*ndm)) goto out; dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); if (!dst_attr) { NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; if (nla_len(dst_attr) < (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; } if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); goto out; } if (dev == NULL) goto out; neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); if (neigh == NULL) { err = -ENOENT; goto out; } err = __neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, NETLINK_CB(skb).portid, extack); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); write_unlock_bh(&tbl->lock); out: return err; } static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_OVERRIDE_ISROUTER; struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct neigh_table *tbl; struct net_device *dev = NULL; struct neighbour *neigh; void *dst, *lladdr; u8 protocol = 0; u32 ndm_flags; int err; ASSERT_RTNL(); err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, nda_policy, extack); if (err < 0) goto out; err = -EINVAL; if (!tb[NDA_DST]) { NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; } ndm = nlmsg_data(nlh); ndm_flags = ndm->ndm_flags; if (tb[NDA_FLAGS_EXT]) { u32 ext = nla_get_u32(tb[NDA_FLAGS_EXT]); BUILD_BUG_ON(sizeof(neigh->flags) * BITS_PER_BYTE < (sizeof(ndm->ndm_flags) * BITS_PER_BYTE + hweight32(NTF_EXT_MASK))); ndm_flags |= (ext << NTF_EXT_SHIFT); } if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) { NL_SET_ERR_MSG(extack, "Invalid link address"); goto out; } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; } dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); if (ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; if (ndm_flags & NTF_MANAGED) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); goto out; } err = -ENOBUFS; pn = pneigh_lookup(tbl, net, dst, dev, 1); if (pn) { pn->flags = ndm_flags; if (protocol) pn->protocol = protocol; err = 0; } goto out; } if (!dev) { NL_SET_ERR_MSG(extack, "Device not specified"); goto out; } if (tbl->allow_add && !tbl->allow_add(dev, extack)) { err = -EINVAL; goto out; } neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT; bool exempt_from_gc = ndm_permanent || ndm_flags & NTF_EXT_LEARNED; if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; goto out; } if (ndm_permanent && (ndm_flags & NTF_MANAGED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag for permanent entry"); err = -EINVAL; goto out; } neigh = ___neigh_create(tbl, dst, dev, ndm_flags & (NTF_EXT_LEARNED | NTF_MANAGED), exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; } } else { if (nlh->nlmsg_flags & NLM_F_EXCL) { err = -EEXIST; neigh_release(neigh); goto out; } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) flags &= ~(NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_OVERRIDE_ISROUTER); } if (protocol) neigh->protocol = protocol; if (ndm_flags & NTF_EXT_LEARNED) flags |= NEIGH_UPDATE_F_EXT_LEARNED; if (ndm_flags & NTF_ROUTER) flags |= NEIGH_UPDATE_F_ISROUTER; if (ndm_flags & NTF_MANAGED) flags |= NEIGH_UPDATE_F_MANAGED; if (ndm_flags & NTF_USE) flags |= NEIGH_UPDATE_F_USE; err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, NETLINK_CB(skb).portid, extack); if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) { neigh_event_send(neigh, NULL); err = 0; } neigh_release(neigh); out: return err; } static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, NDTA_PARMS); if (nest == NULL) return -ENOBUFS; if ((parms->dev && nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) || nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) || nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, NEIGH_VAR(parms, QUEUE_LEN_BYTES)) || /* approximative value for deprecated QUEUE_LEN (in packets) */ nla_put_u32(skb, NDTPA_QUEUE_LEN, NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) || nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) || nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) || nla_put_u32(skb, NDTPA_UCAST_PROBES, NEIGH_VAR(parms, UCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_PROBES, NEIGH_VAR(parms, MCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_REPROBES, NEIGH_VAR(parms, MCAST_REPROBES)) || nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time, NDTPA_PAD) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_GC_STALETIME, NEIGH_VAR(parms, GC_STALETIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME, NEIGH_VAR(parms, DELAY_PROBE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_RETRANS_TIME, NEIGH_VAR(parms, RETRANS_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, NEIGH_VAR(parms, ANYCAST_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_PROXY_DELAY, NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_LOCKTIME, NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_INTERVAL_PROBE_TIME_MS, NEIGH_VAR(parms, INTERVAL_PROBE_TIME_MS), NDTPA_PAD)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, u32 pid, u32 seq, int type, int flags) { struct nlmsghdr *nlh; struct ndtmsg *ndtmsg; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); if (nlh == NULL) return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) || nla_put_msecs(skb, NDTA_GC_INTERVAL, READ_ONCE(tbl->gc_interval), NDTA_PAD) || nla_put_u32(skb, NDTA_THRESH1, READ_ONCE(tbl->gc_thresh1)) || nla_put_u32(skb, NDTA_THRESH2, READ_ONCE(tbl->gc_thresh2)) || nla_put_u32(skb, NDTA_THRESH3, READ_ONCE(tbl->gc_thresh3))) goto nla_put_failure; { unsigned long now = jiffies; long flush_delta = now - READ_ONCE(tbl->last_flush); long rand_delta = now - READ_ONCE(tbl->last_rand); struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, .ndtc_entry_size = tbl->entry_size, .ndtc_entries = atomic_read(&tbl->entries), .ndtc_last_flush = jiffies_to_msecs(flush_delta), .ndtc_last_rand = jiffies_to_msecs(rand_delta), .ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen), }; rcu_read_lock(); nht = rcu_dereference(tbl->nht); ndc.ndtc_hash_rnd = nht->hash_rnd[0]; ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1); rcu_read_unlock(); if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc)) goto nla_put_failure; } { int cpu; struct ndt_stats ndst; memset(&ndst, 0, sizeof(ndst)); for_each_possible_cpu(cpu) { struct neigh_statistics *st; st = per_cpu_ptr(tbl->stats, cpu); ndst.ndts_allocs += READ_ONCE(st->allocs); ndst.ndts_destroys += READ_ONCE(st->destroys); ndst.ndts_hash_grows += READ_ONCE(st->hash_grows); ndst.ndts_res_failed += READ_ONCE(st->res_failed); ndst.ndts_lookups += READ_ONCE(st->lookups); ndst.ndts_hits += READ_ONCE(st->hits); ndst.ndts_rcv_probes_mcast += READ_ONCE(st->rcv_probes_mcast); ndst.ndts_rcv_probes_ucast += READ_ONCE(st->rcv_probes_ucast); ndst.ndts_periodic_gc_runs += READ_ONCE(st->periodic_gc_runs); ndst.ndts_forced_gc_runs += READ_ONCE(st->forced_gc_runs); ndst.ndts_table_fulls += READ_ONCE(st->table_fulls); } if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst, NDTA_PAD)) goto nla_put_failure; } BUG_ON(tbl->parms.dev); if (neightbl_fill_parms(skb, &tbl->parms) < 0) goto nla_put_failure; read_unlock_bh(&tbl->lock); nlmsg_end(skb, nlh); return 0; nla_put_failure: read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int neightbl_fill_param_info(struct sk_buff *skb, struct neigh_table *tbl, struct neigh_parms *parms, u32 pid, u32 seq, int type, unsigned int flags) { struct ndtmsg *ndtmsg; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); if (nlh == NULL) return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) < 0 || neightbl_fill_parms(skb, parms) < 0) goto errout; read_unlock_bh(&tbl->lock); nlmsg_end(skb, nlh); return 0; errout: read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { [NDTA_NAME] = { .type = NLA_STRING }, [NDTA_THRESH1] = { .type = NLA_U32 }, [NDTA_THRESH2] = { .type = NLA_U32 }, [NDTA_THRESH3] = { .type = NLA_U32 }, [NDTA_GC_INTERVAL] = { .type = NLA_U64 }, [NDTA_PARMS] = { .type = NLA_NESTED }, }; static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_IFINDEX] = { .type = NLA_U32 }, [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 }, [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, [NDTPA_APP_PROBES] = { .type = NLA_U32 }, [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_REPROBES] = { .type = NLA_U32 }, [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 }, [NDTPA_GC_STALETIME] = { .type = NLA_U64 }, [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 }, [NDTPA_RETRANS_TIME] = { .type = NLA_U64 }, [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 }, [NDTPA_PROXY_DELAY] = { .type = NLA_U64 }, [NDTPA_LOCKTIME] = { .type = NLA_U64 }, [NDTPA_INTERVAL_PROBE_TIME_MS] = { .type = NLA_U64, .min = 1 }, }; static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct neigh_table *tbl; struct ndtmsg *ndtmsg; struct nlattr *tb[NDTA_MAX+1]; bool found = false; int err, tidx; err = nlmsg_parse_deprecated(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, nl_neightbl_policy, extack); if (err < 0) goto errout; if (tb[NDTA_NAME] == NULL) { err = -EINVAL; goto errout; } ndtmsg = nlmsg_data(nlh); for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { tbl = rcu_dereference_rtnl(neigh_tables[tidx]); if (!tbl) continue; if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) { found = true; break; } } if (!found) return -ENOENT; /* * We acquire tbl->lock to be nice to the periodic timers and * make sure they always see a consistent set of values. */ write_lock_bh(&tbl->lock); if (tb[NDTA_PARMS]) { struct nlattr *tbp[NDTPA_MAX+1]; struct neigh_parms *p; int i, ifindex = 0; err = nla_parse_nested_deprecated(tbp, NDTPA_MAX, tb[NDTA_PARMS], nl_ntbl_parm_policy, extack); if (err < 0) goto errout_tbl_lock; if (tbp[NDTPA_IFINDEX]) ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]); p = lookup_neigh_parms(tbl, net, ifindex); if (p == NULL) { err = -ENOENT; goto errout_tbl_lock; } for (i = 1; i <= NDTPA_MAX; i++) { if (tbp[i] == NULL) continue; switch (i) { case NDTPA_QUEUE_LEN: NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, nla_get_u32(tbp[i]) * SKB_TRUESIZE(ETH_FRAME_LEN)); break; case NDTPA_QUEUE_LENBYTES: NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, nla_get_u32(tbp[i])); break; case NDTPA_PROXY_QLEN: NEIGH_VAR_SET(p, PROXY_QLEN, nla_get_u32(tbp[i])); break; case NDTPA_APP_PROBES: NEIGH_VAR_SET(p, APP_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_UCAST_PROBES: NEIGH_VAR_SET(p, UCAST_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_MCAST_PROBES: NEIGH_VAR_SET(p, MCAST_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_MCAST_REPROBES: NEIGH_VAR_SET(p, MCAST_REPROBES, nla_get_u32(tbp[i])); break; case NDTPA_BASE_REACHABLE_TIME: NEIGH_VAR_SET(p, BASE_REACHABLE_TIME, nla_get_msecs(tbp[i])); /* update reachable_time as well, otherwise, the change will * only be effective after the next time neigh_periodic_work * decides to recompute it (can be multiple minutes) */ p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); break; case NDTPA_GC_STALETIME: NEIGH_VAR_SET(p, GC_STALETIME, nla_get_msecs(tbp[i])); break; case NDTPA_DELAY_PROBE_TIME: NEIGH_VAR_SET(p, DELAY_PROBE_TIME, nla_get_msecs(tbp[i])); call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); break; case NDTPA_INTERVAL_PROBE_TIME_MS: NEIGH_VAR_SET(p, INTERVAL_PROBE_TIME_MS, nla_get_msecs(tbp[i])); break; case NDTPA_RETRANS_TIME: NEIGH_VAR_SET(p, RETRANS_TIME, nla_get_msecs(tbp[i])); break; case NDTPA_ANYCAST_DELAY: NEIGH_VAR_SET(p, ANYCAST_DELAY, nla_get_msecs(tbp[i])); break; case NDTPA_PROXY_DELAY: NEIGH_VAR_SET(p, PROXY_DELAY, nla_get_msecs(tbp[i])); break; case NDTPA_LOCKTIME: NEIGH_VAR_SET(p, LOCKTIME, nla_get_msecs(tbp[i])); break; } } } err = -ENOENT; if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] || tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) && !net_eq(net, &init_net)) goto errout_tbl_lock; if (tb[NDTA_THRESH1]) WRITE_ONCE(tbl->gc_thresh1, nla_get_u32(tb[NDTA_THRESH1])); if (tb[NDTA_THRESH2]) WRITE_ONCE(tbl->gc_thresh2, nla_get_u32(tb[NDTA_THRESH2])); if (tb[NDTA_THRESH3]) WRITE_ONCE(tbl->gc_thresh3, nla_get_u32(tb[NDTA_THRESH3])); if (tb[NDTA_GC_INTERVAL]) WRITE_ONCE(tbl->gc_interval, nla_get_msecs(tb[NDTA_GC_INTERVAL])); err = 0; errout_tbl_lock: write_unlock_bh(&tbl->lock); errout: return err; } static int neightbl_valid_dump_info(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct ndtmsg *ndtm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndtm))) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request"); return -EINVAL; } ndtm = nlmsg_data(nlh); if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ndtm))) { NL_SET_ERR_MSG(extack, "Invalid data after header in neighbor table dump request"); return -EINVAL; } return 0; } static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); int family, tidx, nidx = 0; int tbl_skip = cb->args[0]; int neigh_skip = cb->args[1]; struct neigh_table *tbl; if (cb->strict_check) { int err = neightbl_valid_dump_info(nlh, cb->extack); if (err < 0) return err; } family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; tbl = rcu_dereference_rtnl(neigh_tables[tidx]); if (!tbl) continue; if (tidx < tbl_skip || (family && tbl->family != family)) continue; if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) break; nidx = 0; p = list_next_entry(&tbl->parms, list); list_for_each_entry_from(p, &tbl->parms_list, list) { if (!net_eq(neigh_parms_net(p), net)) continue; if (nidx < neigh_skip) goto next; if (neightbl_fill_param_info(skb, tbl, p, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) goto out; next: nidx++; } neigh_skip = 0; } out: cb->args[0] = tidx; cb->args[1] = nidx; return skb->len; } static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, u32 pid, u32 seq, int type, unsigned int flags) { u32 neigh_flags, neigh_flags_ext; unsigned long now = jiffies; struct nda_cacheinfo ci; struct nlmsghdr *nlh; struct ndmsg *ndm; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; neigh_flags_ext = neigh->flags >> NTF_EXT_SHIFT; neigh_flags = neigh->flags & NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = neigh->ops->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = neigh_flags; ndm->ndm_type = neigh->type; ndm->ndm_ifindex = neigh->dev->ifindex; if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key)) goto nla_put_failure; read_lock_bh(&neigh->lock); ndm->ndm_state = neigh->nud_state; if (neigh->nud_state & NUD_VALID) { char haddr[MAX_ADDR_LEN]; neigh_ha_snapshot(haddr, neigh, neigh->dev); if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) { read_unlock_bh(&neigh->lock); goto nla_put_failure; } } ci.ndm_used = jiffies_to_clock_t(now - neigh->used); ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed); ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated); ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1; read_unlock_bh(&neigh->lock); if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) || nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol)) goto nla_put_failure; if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 pid, u32 seq, int type, unsigned int flags, struct neigh_table *tbl) { u32 neigh_flags, neigh_flags_ext; struct nlmsghdr *nlh; struct ndmsg *ndm; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; neigh_flags_ext = pn->flags >> NTF_EXT_SHIFT; neigh_flags = pn->flags & NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = tbl->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = neigh_flags | NTF_PROXY; ndm->ndm_type = RTN_UNICAST; ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0; ndm->ndm_state = NUD_NONE; if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol)) goto nla_put_failure; if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid) { call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid); } static bool neigh_master_filtered(struct net_device *dev, int master_idx) { struct net_device *master; if (!master_idx) return false; master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL; /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another * invalid value for ifindex to denote "no master". */ if (master_idx == -1) return !!master; if (!master || master->ifindex != master_idx) return true; return false; } static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) { if (filter_idx && (!dev || dev->ifindex != filter_idx)) return true; return false; } struct neigh_dump_filter { int master_idx; int dev_idx; }; static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb, struct neigh_dump_filter *filter) { struct net *net = sock_net(skb->sk); struct neighbour *n; int err = 0, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; unsigned int flags = NLM_F_MULTI; if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; nht = rcu_dereference(tbl->nht); for (h = s_h; h < (1 << nht->hash_shift); h++) { if (h > s_h) s_idx = 0; idx = 0; neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[h]) { if (idx < s_idx || !net_eq(dev_net(n->dev), net)) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags); if (err < 0) goto out; next: idx++; } } out: cb->args[1] = h; cb->args[2] = idx; return err; } static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb, struct neigh_dump_filter *filter) { struct pneigh_entry *n; struct net *net = sock_net(skb->sk); int err = 0, h, s_h = cb->args[3]; int idx, s_idx = idx = cb->args[4]; unsigned int flags = NLM_F_MULTI; if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; read_lock_bh(&tbl->lock); for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { if (idx < s_idx || pneigh_net(n) != net) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags, tbl); if (err < 0) { read_unlock_bh(&tbl->lock); goto out; } next: idx++; } } read_unlock_bh(&tbl->lock); out: cb->args[3] = h; cb->args[4] = idx; return err; } static int neigh_valid_dump_req(const struct nlmsghdr *nlh, bool strict_check, struct neigh_dump_filter *filter, struct netlink_ext_ack *extack) { struct nlattr *tb[NDA_MAX + 1]; int err, i; if (strict_check) { struct ndmsg *ndm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request"); return -EINVAL; } ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); return -EINVAL; } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); } else { err = nlmsg_parse_deprecated(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); } if (err < 0) return err; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; /* all new attributes should require strict_check */ switch (i) { case NDA_IFINDEX: filter->dev_idx = nla_get_u32(tb[i]); break; case NDA_MASTER: filter->master_idx = nla_get_u32(tb[i]); break; default: if (strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor dump request"); return -EINVAL; } } } return 0; } static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct neigh_dump_filter filter = {}; struct neigh_table *tbl; int t, family, s_t; int proxy = 0; int err; family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; /* check for full ndmsg structure presence, family member is * the same for both structures */ if (nlmsg_len(nlh) >= sizeof(struct ndmsg) && ((struct ndmsg *)nlmsg_data(nlh))->ndm_flags == NTF_PROXY) proxy = 1; err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack); if (err < 0 && cb->strict_check) return err; err = 0; s_t = cb->args[0]; rcu_read_lock(); for (t = 0; t < NEIGH_NR_TABLES; t++) { tbl = rcu_dereference(neigh_tables[t]); if (!tbl) continue; if (t < s_t || (family && tbl->family != family)) continue; if (t > s_t) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (proxy) err = pneigh_dump_table(tbl, skb, cb, &filter); else err = neigh_dump_table(tbl, skb, cb, &filter); if (err < 0) break; } rcu_read_unlock(); cb->args[0] = t; return err; } static int neigh_valid_get_req(const struct nlmsghdr *nlh, struct neigh_table **tbl, void **dst, int *dev_idx, u8 *ndm_flags, struct netlink_ext_ack *extack) { struct nlattr *tb[NDA_MAX + 1]; struct ndmsg *ndm; int err, i; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); return -EINVAL; } ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); return -EINVAL; } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) return err; *ndm_flags = ndm->ndm_flags; *dev_idx = ndm->ndm_ifindex; *tbl = neigh_find_table(ndm->ndm_family); if (*tbl == NULL) { NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); return -EAFNOSUPPORT; } for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case NDA_DST: if (nla_len(tb[i]) != (int)(*tbl)->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); return -EINVAL; } *dst = nla_data(tb[i]); break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); return -EINVAL; } } return 0; } static inline size_t neigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + nla_total_size(sizeof(struct nda_cacheinfo)) + nla_total_size(4) /* NDA_PROBES */ + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } static int neigh_get_reply(struct net *net, struct neighbour *neigh, u32 pid, u32 seq) { struct sk_buff *skb; int err = 0; skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); if (!skb) return -ENOBUFS; err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); if (err) { kfree_skb(skb); goto errout; } err = rtnl_unicast(skb, net, pid); errout: return err; } static inline size_t pneigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } static int pneigh_get_reply(struct net *net, struct pneigh_entry *neigh, u32 pid, u32 seq, struct neigh_table *tbl) { struct sk_buff *skb; int err = 0; skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); if (!skb) return -ENOBUFS; err = pneigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0, tbl); if (err) { kfree_skb(skb); goto errout; } err = rtnl_unicast(skb, net, pid); errout: return err; } static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct net_device *dev = NULL; struct neigh_table *tbl = NULL; struct neighbour *neigh; void *dst = NULL; u8 ndm_flags = 0; int dev_idx = 0; int err; err = neigh_valid_get_req(nlh, &tbl, &dst, &dev_idx, &ndm_flags, extack); if (err < 0) return err; if (dev_idx) { dev = __dev_get_by_index(net, dev_idx); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); return -ENODEV; } } if (!dst) { NL_SET_ERR_MSG(extack, "Network address not specified"); return -EINVAL; } if (ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; pn = pneigh_lookup(tbl, net, dst, dev, 0); if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); return -ENOENT; } return pneigh_get_reply(net, pn, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, tbl); } if (!dev) { NL_SET_ERR_MSG(extack, "No device specified"); return -EINVAL; } neigh = neigh_lookup(tbl, dst, dev); if (!neigh) { NL_SET_ERR_MSG(extack, "Neighbour entry not found"); return -ENOENT; } err = neigh_get_reply(net, neigh, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); neigh_release(neigh); return err; } void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) { int chain; struct neigh_hash_table *nht; rcu_read_lock(); nht = rcu_dereference(tbl->nht); read_lock_bh(&tbl->lock); /* avoid resizes */ for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; neigh_for_each_in_bucket(n, &nht->hash_heads[chain]) cb(n, cookie); } read_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_for_each); /* The tbl->lock must be held as a writer and BH disabled. */ void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { struct neigh_hash_table *nht; int chain; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) { int release; write_lock(&n->lock); release = cb(n); if (release) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); } write_unlock(&n->lock); if (release) neigh_cleanup_and_release(n); } } } EXPORT_SYMBOL(__neigh_for_each_release); int neigh_xmit(int index, struct net_device *dev, const void *addr, struct sk_buff *skb) { int err = -EAFNOSUPPORT; if (likely(index < NEIGH_NR_TABLES)) { struct neigh_table *tbl; struct neighbour *neigh; rcu_read_lock(); tbl = rcu_dereference(neigh_tables[index]); if (!tbl) goto out_unlock; if (index == NEIGH_ARP_TABLE) { u32 key = *((u32 *)addr); neigh = __ipv4_neigh_lookup_noref(dev, key); } else { neigh = __neigh_lookup_noref(tbl, addr, dev); } if (!neigh) neigh = __neigh_create(tbl, addr, dev, false); err = PTR_ERR(neigh); if (IS_ERR(neigh)) { rcu_read_unlock(); goto out_kfree_skb; } err = READ_ONCE(neigh->output)(neigh, skb); out_unlock: rcu_read_unlock(); } else if (index == NEIGH_LINK_TABLE) { err = dev_hard_header(skb, dev, ntohs(skb->protocol), addr, NULL, skb->len); if (err < 0) goto out_kfree_skb; err = dev_queue_xmit(skb); } out: return err; out_kfree_skb: kfree_skb(skb); goto out; } EXPORT_SYMBOL(neigh_xmit); #ifdef CONFIG_PROC_FS static struct neighbour *neigh_get_valid(struct seq_file *seq, struct neighbour *n, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); if (!net_eq(dev_net(n->dev), net)) return NULL; if (state->neigh_sub_iter) { loff_t fakep = 0; void *v; v = state->neigh_sub_iter(state, n, pos ? pos : &fakep); if (!v) return NULL; if (pos) return v; } if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) return n; if (READ_ONCE(n->nud_state) & ~NUD_NOARP) return n; return NULL; } static struct neighbour *neigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct neigh_hash_table *nht = state->nht; struct neighbour *n, *tmp; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; while (++state->bucket < (1 << nht->hash_shift)) { neigh_for_each_in_bucket(n, &nht->hash_heads[state->bucket]) { tmp = neigh_get_valid(seq, n, NULL); if (tmp) return tmp; } } return NULL; } static struct neighbour *neigh_get_next(struct seq_file *seq, struct neighbour *n, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct neighbour *tmp; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); if (v) return n; } hlist_for_each_entry_continue(n, hash) { tmp = neigh_get_valid(seq, n, pos); if (tmp) { n = tmp; goto out; } } n = neigh_get_first(seq); out: if (n && pos) --(*pos); return n; } static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos) { struct neighbour *n = neigh_get_first(seq); if (n) { --(*pos); while (*pos) { n = neigh_get_next(seq, n, pos); if (!n) break; } } return *pos ? NULL : n; } static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; struct pneigh_entry *pn = NULL; int bucket; state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { pn = tbl->phash_buckets[bucket]; while (pn && !net_eq(pneigh_net(pn), net)) pn = pn->next; if (pn) break; } state->bucket = bucket; return pn; } static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct pneigh_entry *pn, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; do { pn = pn->next; } while (pn && !net_eq(pneigh_net(pn), net)); while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; pn = tbl->phash_buckets[state->bucket]; while (pn && !net_eq(pneigh_net(pn), net)) pn = pn->next; if (pn) break; } if (pn && pos) --(*pos); return pn; } static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos) { struct pneigh_entry *pn = pneigh_get_first(seq); if (pn) { --(*pos); while (*pos) { pn = pneigh_get_next(seq, pn, pos); if (!pn) break; } } return *pos ? NULL : pn; } static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) { struct neigh_seq_state *state = seq->private; void *rc; loff_t idxpos = *pos; rc = neigh_get_idx(seq, &idxpos); if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY)) rc = pneigh_get_idx(seq, &idxpos); return rc; } void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) __acquires(tbl->lock) __acquires(rcu) { struct neigh_seq_state *state = seq->private; state->tbl = tbl; state->bucket = -1; state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); rcu_read_lock(); state->nht = rcu_dereference(tbl->nht); read_lock_bh(&tbl->lock); return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } EXPORT_SYMBOL(neigh_seq_start); void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct neigh_seq_state *state; void *rc; if (v == SEQ_START_TOKEN) { rc = neigh_get_first(seq); goto out; } state = seq->private; if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) { rc = neigh_get_next(seq, v, NULL); if (rc) goto out; if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY)) rc = pneigh_get_first(seq); } else { BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY); rc = pneigh_get_next(seq, v, NULL); } out: ++(*pos); return rc; } EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) __releases(tbl->lock) __releases(rcu) { struct neigh_seq_state *state = seq->private; struct neigh_table *tbl = state->tbl; read_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_seq_stop); /* statistics via seq_file */ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } return NULL; } static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } (*pos)++; return NULL; } static void neigh_stat_seq_stop(struct seq_file *seq, void *v) { } static int neigh_stat_seq_show(struct seq_file *seq, void *v) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); return 0; } seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " "%08lx %08lx %08lx " "%08lx %08lx %08lx\n", atomic_read(&tbl->entries), st->allocs, st->destroys, st->hash_grows, st->lookups, st->hits, st->res_failed, st->rcv_probes_mcast, st->rcv_probes_ucast, st->periodic_gc_runs, st->forced_gc_runs, st->unres_discards, st->table_fulls ); return 0; } static const struct seq_operations neigh_stat_seq_ops = { .start = neigh_stat_seq_start, .next = neigh_stat_seq_next, .stop = neigh_stat_seq_stop, .show = neigh_stat_seq_show, }; #endif /* CONFIG_PROC_FS */ static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid) { struct sk_buff *skb; int err = -ENOBUFS; struct net *net; rcu_read_lock(); net = dev_net_rcu(n->dev); skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; err = neigh_fill_info(skb, n, pid, 0, type, flags); if (err < 0) { /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); goto out; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); out: rcu_read_unlock(); } void neigh_app_ns(struct neighbour *n) { __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0); } EXPORT_SYMBOL(neigh_app_ns); #ifdef CONFIG_SYSCTL static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int size, ret; struct ctl_table tmp = *ctl; tmp.extra1 = SYSCTL_ZERO; tmp.extra2 = &unres_qlen_max; tmp.data = &size; size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN); return ret; } static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, int index) { struct net_device *dev; int family = neigh_parms_family(p); rcu_read_lock(); for_each_netdev_rcu(net, dev) { struct neigh_parms *dst_p = neigh_get_dev_parms_rcu(dev, family); if (dst_p && !test_bit(index, dst_p->data_state)) dst_p->data[index] = p->data[index]; } rcu_read_unlock(); } static void neigh_proc_update(const struct ctl_table *ctl, int write) { struct net_device *dev = ctl->extra1; struct neigh_parms *p = ctl->extra2; struct net *net = neigh_parms_net(p); int index = (int *) ctl->data - p->data; if (!write) return; set_bit(index, p->data_state); if (index == NEIGH_VAR_DELAY_PROBE_TIME) call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); if (!dev) /* NULL dev means this is default value */ neigh_copy_dflt_parms(net, p, index); } static int neigh_proc_dointvec_zero_intmax(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; tmp.extra1 = SYSCTL_ZERO; tmp.extra2 = SYSCTL_INT_MAX; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } static int neigh_proc_dointvec_ms_jiffies_positive(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; int min = msecs_to_jiffies(1); tmp.extra1 = &min; tmp.extra2 = NULL; ret = proc_dointvec_ms_jiffies_minmax(&tmp, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec); int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); static int neigh_proc_dointvec_userhz_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); static int neigh_proc_dointvec_unres_qlen(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct neigh_parms *p = ctl->extra2; int ret; if (strcmp(ctl->procname, "base_reachable_time") == 0) ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); else if (strcmp(ctl->procname, "base_reachable_time_ms") == 0) ret = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); else ret = -1; if (write && ret == 0) { /* update reachable_time as well, otherwise, the change will * only be effective after the next time neigh_periodic_work * decides to recompute it */ p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); } return ret; } #define NEIGH_PARMS_DATA_OFFSET(index) \ (&((struct neigh_parms *) 0)->data[index]) #define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \ [NEIGH_VAR_ ## attr] = { \ .procname = name, \ .data = NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \ .maxlen = sizeof(int), \ .mode = mval, \ .proc_handler = proc, \ } #define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax) #define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies) #define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies) #define NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies_positive) #define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies) #define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen) static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table neigh_vars[NEIGH_VAR_MAX]; } neigh_sysctl_template __read_mostly = { .neigh_vars = { NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(INTERVAL_PROBE_TIME_MS, "interval_probe_time_ms"), NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"), NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"), NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"), NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"), [NEIGH_VAR_GC_INTERVAL] = { .procname = "gc_interval", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NEIGH_VAR_GC_THRESH1] = { .procname = "gc_thresh1", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH2] = { .procname = "gc_thresh2", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH3] = { .procname = "gc_thresh3", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, }, }; int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, proc_handler *handler) { int i; struct neigh_sysctl_table *t; const char *dev_name_source; char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; char *p_name; size_t neigh_vars_size; t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto err; for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) { t->neigh_vars[i].data += (long) p; t->neigh_vars[i].extra1 = dev; t->neigh_vars[i].extra2 = p; } neigh_vars_size = ARRAY_SIZE(t->neigh_vars); if (dev) { dev_name_source = dev->name; /* Terminate the table early */ neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1; } else { struct neigh_table *tbl = p->tbl; dev_name_source = "default"; t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval; t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1; t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2; t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3; } if (handler) { /* RetransTime */ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler; /* ReachableTime */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler; /* RetransTime (in milliseconds)*/ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler; /* ReachableTime (in milliseconds) */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler; } else { /* Those handlers will update p->reachable_time after * base_reachable_time(_ms) is set to ensure the new timer starts being * applied after the next neighbour update instead of waiting for * neigh_periodic_work to update its value (can be multiple minutes) * So any handler that replaces them should do this as well */ /* ReachableTime */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = neigh_proc_base_reachable_time; /* ReachableTime (in milliseconds) */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = neigh_proc_base_reachable_time; } switch (neigh_parms_family(p)) { case AF_INET: p_name = "ipv4"; break; case AF_INET6: p_name = "ipv6"; break; default: BUG(); } snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s", p_name, dev_name_source); t->sysctl_header = register_net_sysctl_sz(neigh_parms_net(p), neigh_path, t->neigh_vars, neigh_vars_size); if (!t->sysctl_header) goto free; p->sysctl_table = t; return 0; free: kfree(t); err: return -ENOBUFS; } EXPORT_SYMBOL(neigh_sysctl_register); void neigh_sysctl_unregister(struct neigh_parms *p) { if (p->sysctl_table) { struct neigh_sysctl_table *t = p->sysctl_table; p->sysctl_table = NULL; unregister_net_sysctl_table(t->sysctl_header); kfree(t); } } EXPORT_SYMBOL(neigh_sysctl_unregister); #endif /* CONFIG_SYSCTL */ static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWNEIGH, .doit = neigh_add}, {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, .flags = RTNL_FLAG_DUMP_UNLOCKED}, {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info}, {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set}, }; static int __init neigh_init(void) { rtnl_register_many(neigh_rtnl_msg_handlers); return 0; } subsys_initcall(neigh_init); |
| 91 312 91 316 359 361 363 363 48 48 28 27 48 48 47 48 10 10 28 29 29 29 327 329 314 298 17 17 329 327 327 326 17 329 329 325 2 29 1 1 318 319 308 2 20 19 1 1 1 361 362 361 362 361 360 41 344 230 278 362 361 591 593 592 363 360 90 2 318 1 359 569 570 29 29 29 84 9 20 64 29 29 29 359 57 380 362 360 57 361 363 19 19 19 342 341 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 | // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_rmap.h" #include "xfs_refcount.h" #include "xfs_bmap.h" #include "xfs_alloc.h" #include "xfs_buf.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_exchmaps.h" static struct kmem_cache *xfs_defer_pending_cache; /* * Deferred Operations in XFS * * Due to the way locking rules work in XFS, certain transactions (block * mapping and unmapping, typically) have permanent reservations so that * we can roll the transaction to adhere to AG locking order rules and * to unlock buffers between metadata updates. Prior to rmap/reflink, * the mapping code had a mechanism to perform these deferrals for * extents that were going to be freed; this code makes that facility * more generic. * * When adding the reverse mapping and reflink features, it became * necessary to perform complex remapping multi-transactions to comply * with AG locking order rules, and to be able to spread a single * refcount update operation (an operation on an n-block extent can * update as many as n records!) among multiple transactions. XFS can * roll a transaction to facilitate this, but using this facility * requires us to log "intent" items in case log recovery needs to * redo the operation, and to log "done" items to indicate that redo * is not necessary. * * Deferred work is tracked in xfs_defer_pending items. Each pending * item tracks one type of deferred work. Incoming work items (which * have not yet had an intent logged) are attached to a pending item * on the dop_intake list, where they wait for the caller to finish * the deferred operations. * * Finishing a set of deferred operations is an involved process. To * start, we define "rolling a deferred-op transaction" as follows: * * > For each xfs_defer_pending item on the dop_intake list, * - Sort the work items in AG order. XFS locking * order rules require us to lock buffers in AG order. * - Create a log intent item for that type. * - Attach it to the pending item. * - Move the pending item from the dop_intake list to the * dop_pending list. * > Roll the transaction. * * NOTE: To avoid exceeding the transaction reservation, we limit the * number of items that we attach to a given xfs_defer_pending. * * The actual finishing process looks like this: * * > For each xfs_defer_pending in the dop_pending list, * - Roll the deferred-op transaction as above. * - Create a log done item for that type, and attach it to the * log intent item. * - For each work item attached to the log intent item, * * Perform the described action. * * Attach the work item to the log done item. * * If the result of doing the work was -EAGAIN, ->finish work * wants a new transaction. See the "Requesting a Fresh * Transaction while Finishing Deferred Work" section below for * details. * * The key here is that we must log an intent item for all pending * work items every time we roll the transaction, and that we must log * a done item as soon as the work is completed. With this mechanism * we can perform complex remapping operations, chaining intent items * as needed. * * Requesting a Fresh Transaction while Finishing Deferred Work * * If ->finish_item decides that it needs a fresh transaction to * finish the work, it must ask its caller (xfs_defer_finish) for a * continuation. The most likely cause of this circumstance are the * refcount adjust functions deciding that they've logged enough items * to be at risk of exceeding the transaction reservation. * * To get a fresh transaction, we want to log the existing log done * item to prevent the log intent item from replaying, immediately log * a new log intent item with the unfinished work items, roll the * transaction, and re-call ->finish_item wherever it left off. The * log done item and the new log intent item must be in the same * transaction or atomicity cannot be guaranteed; defer_finish ensures * that this happens. * * This requires some coordination between ->finish_item and * defer_finish. Upon deciding to request a new transaction, * ->finish_item should update the current work item to reflect the * unfinished work. Next, it should reset the log done item's list * count to the number of items finished, and return -EAGAIN. * defer_finish sees the -EAGAIN, logs the new log intent item * with the remaining work items, and leaves the xfs_defer_pending * item at the head of the dop_work queue. Then it rolls the * transaction and picks up processing where it left off. It is * required that ->finish_item must be careful to leave enough * transaction reservation to fit the new log intent item. * * This is an example of remapping the extent (E, E+B) into file X at * offset A and dealing with the extent (C, C+B) already being mapped * there: * +-------------------------------------------------+ * | Unmap file X startblock C offset A length B | t0 * | Intent to reduce refcount for extent (C, B) | * | Intent to remove rmap (X, C, A, B) | * | Intent to free extent (D, 1) (bmbt block) | * | Intent to map (X, A, B) at startblock E | * +-------------------------------------------------+ * | Map file X startblock E offset A length B | t1 * | Done mapping (X, E, A, B) | * | Intent to increase refcount for extent (E, B) | * | Intent to add rmap (X, E, A, B) | * +-------------------------------------------------+ * | Reduce refcount for extent (C, B) | t2 * | Done reducing refcount for extent (C, 9) | * | Intent to reduce refcount for extent (C+9, B-9) | * | (ran out of space after 9 refcount updates) | * +-------------------------------------------------+ * | Reduce refcount for extent (C+9, B+9) | t3 * | Done reducing refcount for extent (C+9, B-9) | * | Increase refcount for extent (E, B) | * | Done increasing refcount for extent (E, B) | * | Intent to free extent (C, B) | * | Intent to free extent (F, 1) (refcountbt block) | * | Intent to remove rmap (F, 1, REFC) | * +-------------------------------------------------+ * | Remove rmap (X, C, A, B) | t4 * | Done removing rmap (X, C, A, B) | * | Add rmap (X, E, A, B) | * | Done adding rmap (X, E, A, B) | * | Remove rmap (F, 1, REFC) | * | Done removing rmap (F, 1, REFC) | * +-------------------------------------------------+ * | Free extent (C, B) | t5 * | Done freeing extent (C, B) | * | Free extent (D, 1) | * | Done freeing extent (D, 1) | * | Free extent (F, 1) | * | Done freeing extent (F, 1) | * +-------------------------------------------------+ * * If we should crash before t2 commits, log recovery replays * the following intent items: * * - Intent to reduce refcount for extent (C, B) * - Intent to remove rmap (X, C, A, B) * - Intent to free extent (D, 1) (bmbt block) * - Intent to increase refcount for extent (E, B) * - Intent to add rmap (X, E, A, B) * * In the process of recovering, it should also generate and take care * of these intent items: * * - Intent to free extent (C, B) * - Intent to free extent (F, 1) (refcountbt block) * - Intent to remove rmap (F, 1, REFC) * * Note that the continuation requested between t2 and t3 is likely to * reoccur. */ STATIC struct xfs_log_item * xfs_defer_barrier_create_intent( struct xfs_trans *tp, struct list_head *items, unsigned int count, bool sort) { return NULL; } STATIC void xfs_defer_barrier_abort_intent( struct xfs_log_item *intent) { /* empty */ } STATIC struct xfs_log_item * xfs_defer_barrier_create_done( struct xfs_trans *tp, struct xfs_log_item *intent, unsigned int count) { return NULL; } STATIC int xfs_defer_barrier_finish_item( struct xfs_trans *tp, struct xfs_log_item *done, struct list_head *item, struct xfs_btree_cur **state) { ASSERT(0); return -EFSCORRUPTED; } STATIC void xfs_defer_barrier_cancel_item( struct list_head *item) { ASSERT(0); } static const struct xfs_defer_op_type xfs_barrier_defer_type = { .max_items = 1, .create_intent = xfs_defer_barrier_create_intent, .abort_intent = xfs_defer_barrier_abort_intent, .create_done = xfs_defer_barrier_create_done, .finish_item = xfs_defer_barrier_finish_item, .cancel_item = xfs_defer_barrier_cancel_item, }; /* Create a log intent done item for a log intent item. */ static inline void xfs_defer_create_done( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { struct xfs_log_item *lip; /* If there is no log intent item, there can be no log done item. */ if (!dfp->dfp_intent) return; /* * Mark the transaction dirty, even on error. This ensures the * transaction is aborted, which: * * 1.) releases the log intent item and frees the log done item * 2.) shuts down the filesystem */ tp->t_flags |= XFS_TRANS_DIRTY; lip = dfp->dfp_ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); if (!lip) return; tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE; xfs_trans_add_item(tp, lip); set_bit(XFS_LI_DIRTY, &lip->li_flags); dfp->dfp_done = lip; } /* * Ensure there's a log intent item associated with this deferred work item if * the operation must be restarted on crash. Returns 1 if there's a log item; * 0 if there isn't; or a negative errno. */ static int xfs_defer_create_intent( struct xfs_trans *tp, struct xfs_defer_pending *dfp, bool sort) { struct xfs_log_item *lip; if (dfp->dfp_intent) return 1; lip = dfp->dfp_ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); if (!lip) return 0; if (IS_ERR(lip)) return PTR_ERR(lip); tp->t_flags |= XFS_TRANS_DIRTY; xfs_trans_add_item(tp, lip); set_bit(XFS_LI_DIRTY, &lip->li_flags); dfp->dfp_intent = lip; return 1; } /* * For each pending item in the intake list, log its intent item and the * associated extents, then add the entire intake list to the end of * the pending list. * * Returns 1 if at least one log item was associated with the deferred work; * 0 if there are no log items; or a negative errno. */ static int xfs_defer_create_intents( struct xfs_trans *tp) { struct xfs_defer_pending *dfp; int ret = 0; list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { int ret2; trace_xfs_defer_create_intent(tp->t_mountp, dfp); ret2 = xfs_defer_create_intent(tp, dfp, true); if (ret2 < 0) return ret2; ret |= ret2; } return ret; } static inline void xfs_defer_pending_abort( struct xfs_mount *mp, struct xfs_defer_pending *dfp) { trace_xfs_defer_pending_abort(mp, dfp); if (dfp->dfp_intent && !dfp->dfp_done) { dfp->dfp_ops->abort_intent(dfp->dfp_intent); dfp->dfp_intent = NULL; } } static inline void xfs_defer_pending_cancel_work( struct xfs_mount *mp, struct xfs_defer_pending *dfp) { struct list_head *pwi; struct list_head *n; trace_xfs_defer_cancel_list(mp, dfp); list_del(&dfp->dfp_list); list_for_each_safe(pwi, n, &dfp->dfp_work) { list_del(pwi); dfp->dfp_count--; trace_xfs_defer_cancel_item(mp, dfp, pwi); dfp->dfp_ops->cancel_item(pwi); } ASSERT(dfp->dfp_count == 0); kmem_cache_free(xfs_defer_pending_cache, dfp); } STATIC void xfs_defer_pending_abort_list( struct xfs_mount *mp, struct list_head *dop_list) { struct xfs_defer_pending *dfp; /* Abort intent items that don't have a done item. */ list_for_each_entry(dfp, dop_list, dfp_list) xfs_defer_pending_abort(mp, dfp); } /* Abort all the intents that were committed. */ STATIC void xfs_defer_trans_abort( struct xfs_trans *tp, struct list_head *dop_pending) { trace_xfs_defer_trans_abort(tp, _RET_IP_); xfs_defer_pending_abort_list(tp->t_mountp, dop_pending); } /* * Capture resources that the caller said not to release ("held") when the * transaction commits. Caller is responsible for zero-initializing @dres. */ static int xfs_defer_save_resources( struct xfs_defer_resources *dres, struct xfs_trans *tp) { struct xfs_buf_log_item *bli; struct xfs_inode_log_item *ili; struct xfs_log_item *lip; BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS); list_for_each_entry(lip, &tp->t_items, li_trans) { switch (lip->li_type) { case XFS_LI_BUF: bli = container_of(lip, struct xfs_buf_log_item, bli_item); if (bli->bli_flags & XFS_BLI_HOLD) { if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) { ASSERT(0); return -EFSCORRUPTED; } if (bli->bli_flags & XFS_BLI_ORDERED) dres->dr_ordered |= (1U << dres->dr_bufs); else xfs_trans_dirty_buf(tp, bli->bli_buf); dres->dr_bp[dres->dr_bufs++] = bli->bli_buf; } break; case XFS_LI_INODE: ili = container_of(lip, struct xfs_inode_log_item, ili_item); if (ili->ili_lock_flags == 0) { if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) { ASSERT(0); return -EFSCORRUPTED; } xfs_trans_log_inode(tp, ili->ili_inode, XFS_ILOG_CORE); dres->dr_ip[dres->dr_inos++] = ili->ili_inode; } break; default: break; } } return 0; } /* Attach the held resources to the transaction. */ static void xfs_defer_restore_resources( struct xfs_trans *tp, struct xfs_defer_resources *dres) { unsigned short i; /* Rejoin the joined inodes. */ for (i = 0; i < dres->dr_inos; i++) xfs_trans_ijoin(tp, dres->dr_ip[i], 0); /* Rejoin the buffers and dirty them so the log moves forward. */ for (i = 0; i < dres->dr_bufs; i++) { xfs_trans_bjoin(tp, dres->dr_bp[i]); if (dres->dr_ordered & (1U << i)) xfs_trans_ordered_buf(tp, dres->dr_bp[i]); xfs_trans_bhold(tp, dres->dr_bp[i]); } } /* Roll a transaction so we can do some deferred op processing. */ STATIC int xfs_defer_trans_roll( struct xfs_trans **tpp) { struct xfs_defer_resources dres = { }; int error; error = xfs_defer_save_resources(&dres, *tpp); if (error) return error; trace_xfs_defer_trans_roll(*tpp, _RET_IP_); /* * Roll the transaction. Rolling always given a new transaction (even * if committing the old one fails!) to hand back to the caller, so we * join the held resources to the new transaction so that we always * return with the held resources joined to @tpp, no matter what * happened. */ error = xfs_trans_roll(tpp); xfs_defer_restore_resources(*tpp, &dres); if (error) trace_xfs_defer_trans_roll_error(*tpp, error); return error; } /* * Free up any items left in the list. */ static void xfs_defer_cancel_list( struct xfs_mount *mp, struct list_head *dop_list) { struct xfs_defer_pending *dfp; struct xfs_defer_pending *pli; /* * Free the pending items. Caller should already have arranged * for the intent items to be released. */ list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) xfs_defer_pending_cancel_work(mp, dfp); } static inline void xfs_defer_relog_intent( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { struct xfs_log_item *lip; xfs_defer_create_done(tp, dfp); lip = dfp->dfp_ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done); if (lip) { xfs_trans_add_item(tp, lip); set_bit(XFS_LI_DIRTY, &lip->li_flags); } dfp->dfp_done = NULL; dfp->dfp_intent = lip; } /* * Prevent a log intent item from pinning the tail of the log by logging a * done item to release the intent item; and then log a new intent item. * The caller should provide a fresh transaction and roll it after we're done. */ static void xfs_defer_relog( struct xfs_trans **tpp, struct list_head *dfops) { struct xlog *log = (*tpp)->t_mountp->m_log; struct xfs_defer_pending *dfp; xfs_lsn_t threshold_lsn = NULLCOMMITLSN; ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); list_for_each_entry(dfp, dfops, dfp_list) { /* * If the log intent item for this deferred op is not a part of * the current log checkpoint, relog the intent item to keep * the log tail moving forward. We're ok with this being racy * because an incorrect decision means we'll be a little slower * at pushing the tail. */ if (dfp->dfp_intent == NULL || xfs_log_item_in_current_chkpt(dfp->dfp_intent)) continue; /* * Figure out where we need the tail to be in order to maintain * the minimum required free space in the log. Only sample * the log threshold once per call. */ if (threshold_lsn == NULLCOMMITLSN) { threshold_lsn = xfs_ail_get_push_target(log->l_ailp); if (threshold_lsn == NULLCOMMITLSN) break; } if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0) continue; trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); XFS_STATS_INC((*tpp)->t_mountp, defer_relog); xfs_defer_relog_intent(*tpp, dfp); } } /* * Log an intent-done item for the first pending intent, and finish the work * items. */ int xfs_defer_finish_one( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { const struct xfs_defer_op_type *ops = dfp->dfp_ops; struct xfs_btree_cur *state = NULL; struct list_head *li, *n; int error; trace_xfs_defer_pending_finish(tp->t_mountp, dfp); xfs_defer_create_done(tp, dfp); list_for_each_safe(li, n, &dfp->dfp_work) { list_del(li); dfp->dfp_count--; trace_xfs_defer_finish_item(tp->t_mountp, dfp, li); error = ops->finish_item(tp, dfp->dfp_done, li, &state); if (error == -EAGAIN) { int ret; /* * Caller wants a fresh transaction; put the work item * back on the list and log a new log intent item to * replace the old one. See "Requesting a Fresh * Transaction while Finishing Deferred Work" above. */ list_add(li, &dfp->dfp_work); dfp->dfp_count++; dfp->dfp_done = NULL; dfp->dfp_intent = NULL; ret = xfs_defer_create_intent(tp, dfp, false); if (ret < 0) error = ret; } if (error) goto out; } /* Done with the dfp, free it. */ list_del(&dfp->dfp_list); kmem_cache_free(xfs_defer_pending_cache, dfp); out: if (ops->finish_cleanup) ops->finish_cleanup(tp, state, error); return error; } /* Move all paused deferred work from @tp to @paused_list. */ static void xfs_defer_isolate_paused( struct xfs_trans *tp, struct list_head *paused_list) { struct xfs_defer_pending *dfp; struct xfs_defer_pending *pli; list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) { if (!(dfp->dfp_flags & XFS_DEFER_PAUSED)) continue; list_move_tail(&dfp->dfp_list, paused_list); trace_xfs_defer_isolate_paused(tp->t_mountp, dfp); } } /* * Finish all the pending work. This involves logging intent items for * any work items that wandered in since the last transaction roll (if * one has even happened), rolling the transaction, and finishing the * work items in the first item on the logged-and-pending list. * * If an inode is provided, relog it to the new transaction. */ int xfs_defer_finish_noroll( struct xfs_trans **tp) { struct xfs_defer_pending *dfp = NULL; int error = 0; LIST_HEAD(dop_pending); LIST_HEAD(dop_paused); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); trace_xfs_defer_finish(*tp, _RET_IP_); /* Until we run out of pending work to finish... */ while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) { /* * Deferred items that are created in the process of finishing * other deferred work items should be queued at the head of * the pending list, which puts them ahead of the deferred work * that was created by the caller. This keeps the number of * pending work items to a minimum, which decreases the amount * of time that any one intent item can stick around in memory, * pinning the log tail. */ int has_intents = xfs_defer_create_intents(*tp); xfs_defer_isolate_paused(*tp, &dop_paused); list_splice_init(&(*tp)->t_dfops, &dop_pending); if (has_intents < 0) { error = has_intents; goto out_shutdown; } if (has_intents || dfp) { error = xfs_defer_trans_roll(tp); if (error) goto out_shutdown; /* Relog intent items to keep the log moving. */ xfs_defer_relog(tp, &dop_pending); xfs_defer_relog(tp, &dop_paused); if ((*tp)->t_flags & XFS_TRANS_DIRTY) { error = xfs_defer_trans_roll(tp); if (error) goto out_shutdown; } } dfp = list_first_entry_or_null(&dop_pending, struct xfs_defer_pending, dfp_list); if (!dfp) break; error = xfs_defer_finish_one(*tp, dfp); if (error && error != -EAGAIN) goto out_shutdown; } /* Requeue the paused items in the outgoing transaction. */ list_splice_tail_init(&dop_paused, &(*tp)->t_dfops); trace_xfs_defer_finish_done(*tp, _RET_IP_); return 0; out_shutdown: list_splice_tail_init(&dop_paused, &dop_pending); xfs_defer_trans_abort(*tp, &dop_pending); xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); trace_xfs_defer_finish_error(*tp, error); xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); xfs_defer_cancel(*tp); return error; } int xfs_defer_finish( struct xfs_trans **tp) { #ifdef DEBUG struct xfs_defer_pending *dfp; #endif int error; /* * Finish and roll the transaction once more to avoid returning to the * caller with a dirty transaction. */ error = xfs_defer_finish_noroll(tp); if (error) return error; if ((*tp)->t_flags & XFS_TRANS_DIRTY) { error = xfs_defer_trans_roll(tp); if (error) { xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); return error; } } /* Reset LOWMODE now that we've finished all the dfops. */ #ifdef DEBUG list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list) ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); #endif (*tp)->t_flags &= ~XFS_TRANS_LOWMODE; return 0; } void xfs_defer_cancel( struct xfs_trans *tp) { struct xfs_mount *mp = tp->t_mountp; trace_xfs_defer_cancel(tp, _RET_IP_); xfs_defer_trans_abort(tp, &tp->t_dfops); xfs_defer_cancel_list(mp, &tp->t_dfops); } /* * Return the last pending work item attached to this transaction if it matches * the deferred op type. */ static inline struct xfs_defer_pending * xfs_defer_find_last( struct xfs_trans *tp, const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp = NULL; /* No dfops at all? */ if (list_empty(&tp->t_dfops)) return NULL; dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, dfp_list); /* Wrong type? */ if (dfp->dfp_ops != ops) return NULL; return dfp; } /* * Decide if we can add a deferred work item to the last dfops item attached * to the transaction. */ static inline bool xfs_defer_can_append( struct xfs_defer_pending *dfp, const struct xfs_defer_op_type *ops) { /* Already logged? */ if (dfp->dfp_intent) return false; /* Paused items cannot absorb more work */ if (dfp->dfp_flags & XFS_DEFER_PAUSED) return NULL; /* Already full? */ if (ops->max_items && dfp->dfp_count >= ops->max_items) return false; return true; } /* Create a new pending item at the end of the transaction list. */ static inline struct xfs_defer_pending * xfs_defer_alloc( struct list_head *dfops, const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp; dfp = kmem_cache_zalloc(xfs_defer_pending_cache, GFP_KERNEL | __GFP_NOFAIL); dfp->dfp_ops = ops; INIT_LIST_HEAD(&dfp->dfp_work); list_add_tail(&dfp->dfp_list, dfops); return dfp; } /* Add an item for later deferred processing. */ struct xfs_defer_pending * xfs_defer_add( struct xfs_trans *tp, struct list_head *li, const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp = NULL; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); if (!ops->finish_item) { ASSERT(ops->finish_item != NULL); xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE); return NULL; } dfp = xfs_defer_find_last(tp, ops); if (!dfp || !xfs_defer_can_append(dfp, ops)) dfp = xfs_defer_alloc(&tp->t_dfops, ops); xfs_defer_add_item(dfp, li); trace_xfs_defer_add_item(tp->t_mountp, dfp, li); return dfp; } /* * Add a defer ops barrier to force two otherwise adjacent deferred work items * to be tracked separately and have separate log items. */ void xfs_defer_add_barrier( struct xfs_trans *tp) { struct xfs_defer_pending *dfp; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); /* If the last defer op added was a barrier, we're done. */ dfp = xfs_defer_find_last(tp, &xfs_barrier_defer_type); if (dfp) return; xfs_defer_alloc(&tp->t_dfops, &xfs_barrier_defer_type); trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL); } /* * Create a pending deferred work item to replay the recovered intent item * and add it to the list. */ void xfs_defer_start_recovery( struct xfs_log_item *lip, struct list_head *r_dfops, const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp = xfs_defer_alloc(r_dfops, ops); dfp->dfp_intent = lip; } /* * Cancel a deferred work item created to recover a log intent item. @dfp * will be freed after this function returns. */ void xfs_defer_cancel_recovery( struct xfs_mount *mp, struct xfs_defer_pending *dfp) { xfs_defer_pending_abort(mp, dfp); xfs_defer_pending_cancel_work(mp, dfp); } /* Replay the deferred work item created from a recovered log intent item. */ int xfs_defer_finish_recovery( struct xfs_mount *mp, struct xfs_defer_pending *dfp, struct list_head *capture_list) { const struct xfs_defer_op_type *ops = dfp->dfp_ops; int error; /* dfp is freed by recover_work and must not be accessed afterwards */ error = ops->recover_work(dfp, capture_list); if (error) trace_xlog_intent_recovery_failed(mp, ops, error); return error; } /* * Move deferred ops from one transaction to another and reset the source to * initial state. This is primarily used to carry state forward across * transaction rolls with pending dfops. */ void xfs_defer_move( struct xfs_trans *dtp, struct xfs_trans *stp) { list_splice_init(&stp->t_dfops, &dtp->t_dfops); /* * Low free space mode was historically controlled by a dfops field. * This meant that low mode state potentially carried across multiple * transaction rolls. Transfer low mode on a dfops move to preserve * that behavior. */ dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE); stp->t_flags &= ~XFS_TRANS_LOWMODE; } /* * Prepare a chain of fresh deferred ops work items to be completed later. Log * recovery requires the ability to put off until later the actual finishing * work so that it can process unfinished items recovered from the log in * correct order. * * Create and log intent items for all the work that we're capturing so that we * can be assured that the items will get replayed if the system goes down * before log recovery gets a chance to finish the work it put off. The entire * deferred ops state is transferred to the capture structure and the * transaction is then ready for the caller to commit it. If there are no * intent items to capture, this function returns NULL. * * If capture_ip is not NULL, the capture structure will obtain an extra * reference to the inode. */ static struct xfs_defer_capture * xfs_defer_ops_capture( struct xfs_trans *tp) { struct xfs_defer_capture *dfc; unsigned short i; int error; if (list_empty(&tp->t_dfops)) return NULL; error = xfs_defer_create_intents(tp); if (error < 0) return ERR_PTR(error); /* Create an object to capture the defer ops. */ dfc = kzalloc(sizeof(*dfc), GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&dfc->dfc_list); INIT_LIST_HEAD(&dfc->dfc_dfops); /* Move the dfops chain and transaction state to the capture struct. */ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; tp->t_flags &= ~XFS_TRANS_LOWMODE; /* Capture the remaining block reservations along with the dfops. */ dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used; dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used; /* Preserve the log reservation size. */ dfc->dfc_logres = tp->t_log_res; error = xfs_defer_save_resources(&dfc->dfc_held, tp); if (error) { /* * Resource capture should never fail, but if it does, we * still have to shut down the log and release things * properly. */ xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE); } /* * Grab extra references to the inodes and buffers because callers are * expected to release their held references after we commit the * transaction. */ for (i = 0; i < dfc->dfc_held.dr_inos; i++) { xfs_assert_ilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL); ihold(VFS_I(dfc->dfc_held.dr_ip[i])); } for (i = 0; i < dfc->dfc_held.dr_bufs; i++) xfs_buf_hold(dfc->dfc_held.dr_bp[i]); return dfc; } /* Release all resources that we used to capture deferred ops. */ void xfs_defer_ops_capture_abort( struct xfs_mount *mp, struct xfs_defer_capture *dfc) { unsigned short i; xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops); xfs_defer_cancel_list(mp, &dfc->dfc_dfops); for (i = 0; i < dfc->dfc_held.dr_bufs; i++) xfs_buf_relse(dfc->dfc_held.dr_bp[i]); for (i = 0; i < dfc->dfc_held.dr_inos; i++) xfs_irele(dfc->dfc_held.dr_ip[i]); kfree(dfc); } /* * Capture any deferred ops and commit the transaction. This is the last step * needed to finish a log intent item that we recovered from the log. If any * of the deferred ops operate on an inode, the caller must pass in that inode * so that the reference can be transferred to the capture structure. The * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling * xfs_defer_ops_continue. */ int xfs_defer_ops_capture_and_commit( struct xfs_trans *tp, struct list_head *capture_list) { struct xfs_mount *mp = tp->t_mountp; struct xfs_defer_capture *dfc; int error; /* If we don't capture anything, commit transaction and exit. */ dfc = xfs_defer_ops_capture(tp); if (IS_ERR(dfc)) { xfs_trans_cancel(tp); return PTR_ERR(dfc); } if (!dfc) return xfs_trans_commit(tp); /* Commit the transaction and add the capture structure to the list. */ error = xfs_trans_commit(tp); if (error) { xfs_defer_ops_capture_abort(mp, dfc); return error; } list_add_tail(&dfc->dfc_list, capture_list); return 0; } /* * Attach a chain of captured deferred ops to a new transaction and free the * capture structure. If an inode was captured, it will be passed back to the * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0. * The caller now owns the inode reference. */ void xfs_defer_ops_continue( struct xfs_defer_capture *dfc, struct xfs_trans *tp, struct xfs_defer_resources *dres) { unsigned int i; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); /* Lock the captured resources to the new transaction. */ if (dfc->dfc_held.dr_inos > 2) { xfs_sort_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos); xfs_lock_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos, XFS_ILOCK_EXCL); } else if (dfc->dfc_held.dr_inos == 2) xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); else if (dfc->dfc_held.dr_inos == 1) xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL); for (i = 0; i < dfc->dfc_held.dr_bufs; i++) xfs_buf_lock(dfc->dfc_held.dr_bp[i]); /* Join the captured resources to the new transaction. */ xfs_defer_restore_resources(tp, &dfc->dfc_held); memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources)); dres->dr_bufs = 0; /* Move captured dfops chain and state to the transaction. */ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); tp->t_flags |= dfc->dfc_tpflags; kfree(dfc); } /* Release the resources captured and continued during recovery. */ void xfs_defer_resources_rele( struct xfs_defer_resources *dres) { unsigned short i; for (i = 0; i < dres->dr_inos; i++) { xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL); xfs_irele(dres->dr_ip[i]); dres->dr_ip[i] = NULL; } for (i = 0; i < dres->dr_bufs; i++) { xfs_buf_relse(dres->dr_bp[i]); dres->dr_bp[i] = NULL; } dres->dr_inos = 0; dres->dr_bufs = 0; dres->dr_ordered = 0; } static inline int __init xfs_defer_init_cache(void) { xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending", sizeof(struct xfs_defer_pending), 0, 0, NULL); return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM; } static inline void xfs_defer_destroy_cache(void) { kmem_cache_destroy(xfs_defer_pending_cache); xfs_defer_pending_cache = NULL; } /* Set up caches for deferred work items. */ int __init xfs_defer_init_item_caches(void) { int error; error = xfs_defer_init_cache(); if (error) return error; error = xfs_rmap_intent_init_cache(); if (error) goto err; error = xfs_refcount_intent_init_cache(); if (error) goto err; error = xfs_bmap_intent_init_cache(); if (error) goto err; error = xfs_extfree_intent_init_cache(); if (error) goto err; error = xfs_attr_intent_init_cache(); if (error) goto err; error = xfs_exchmaps_intent_init_cache(); if (error) goto err; return 0; err: xfs_defer_destroy_item_caches(); return error; } /* Destroy all the deferred work item caches, if they've been allocated. */ void xfs_defer_destroy_item_caches(void) { xfs_exchmaps_intent_destroy_cache(); xfs_attr_intent_destroy_cache(); xfs_extfree_intent_destroy_cache(); xfs_bmap_intent_destroy_cache(); xfs_refcount_intent_destroy_cache(); xfs_rmap_intent_destroy_cache(); xfs_defer_destroy_cache(); } /* * Mark a deferred work item so that it will be requeued indefinitely without * being finished. Caller must ensure there are no data dependencies on this * work item in the meantime. */ void xfs_defer_item_pause( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED)); dfp->dfp_flags |= XFS_DEFER_PAUSED; trace_xfs_defer_item_pause(tp->t_mountp, dfp); } /* * Release a paused deferred work item so that it will be finished during the * next transaction roll. */ void xfs_defer_item_unpause( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); dfp->dfp_flags &= ~XFS_DEFER_PAUSED; trace_xfs_defer_item_unpause(tp->t_mountp, dfp); } |
| 20 19 1 1 14 1 12 6 2 1 4 4 5 1 3 1 1 1 2 1 1 1 5 5 1 3 1 3 1 1 1 1 1 13 1 1 3 2 1 19 1 5 1 22 1 1 1 1 1 4 2 1 1 1 1 8 1 1 2 1 1 1 2 1 1 2 1 1 9 6 3 1 1 1 4 1 1 3 1 1 1 1 16 1 1 2 3 1 1 1 7 3 5 1 3 3 1 4 143 2 8 18 7 27 9 12 5 4 4 11 7 6 19 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 | // SPDX-License-Identifier: GPL-2.0-only /* * VMware VMCI Driver * * Copyright (C) 2012 VMware, Inc. All rights reserved. */ #include <linux/vmw_vmci_defs.h> #include <linux/vmw_vmci_api.h> #include <linux/miscdevice.h> #include <linux/interrupt.h> #include <linux/highmem.h> #include <linux/atomic.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/pci.h> #include <linux/smp.h> #include <linux/fs.h> #include <linux/io.h> #include "vmci_handle_array.h" #include "vmci_queue_pair.h" #include "vmci_datagram.h" #include "vmci_doorbell.h" #include "vmci_resource.h" #include "vmci_context.h" #include "vmci_driver.h" #include "vmci_event.h" #define VMCI_UTIL_NUM_RESOURCES 1 enum { VMCI_NOTIFY_RESOURCE_QUEUE_PAIR = 0, VMCI_NOTIFY_RESOURCE_DOOR_BELL = 1, }; enum { VMCI_NOTIFY_RESOURCE_ACTION_NOTIFY = 0, VMCI_NOTIFY_RESOURCE_ACTION_CREATE = 1, VMCI_NOTIFY_RESOURCE_ACTION_DESTROY = 2, }; /* * VMCI driver initialization. This block can also be used to * pass initial group membership etc. */ struct vmci_init_blk { u32 cid; u32 flags; }; /* VMCIqueue_pairAllocInfo_VMToVM */ struct vmci_qp_alloc_info_vmvm { struct vmci_handle handle; u32 peer; u32 flags; u64 produce_size; u64 consume_size; u64 produce_page_file; /* User VA. */ u64 consume_page_file; /* User VA. */ u64 produce_page_file_size; /* Size of the file name array. */ u64 consume_page_file_size; /* Size of the file name array. */ s32 result; u32 _pad; }; /* VMCISetNotifyInfo: Used to pass notify flag's address to the host driver. */ struct vmci_set_notify_info { u64 notify_uva; s32 result; u32 _pad; }; /* * Per-instance host state */ struct vmci_host_dev { struct vmci_ctx *context; int user_version; enum vmci_obj_type ct_type; struct mutex lock; /* Mutex lock for vmci context access */ }; static struct vmci_ctx *host_context; static bool vmci_host_device_initialized; static atomic_t vmci_host_active_users = ATOMIC_INIT(0); /* * Determines whether the VMCI host personality is * available. Since the core functionality of the host driver is * always present, all guests could possibly use the host * personality. However, to minimize the deviation from the * pre-unified driver state of affairs, we only consider the host * device active if there is no active guest device or if there * are VMX'en with active VMCI contexts using the host device. */ bool vmci_host_code_active(void) { return vmci_host_device_initialized && (!vmci_guest_code_active() || atomic_read(&vmci_host_active_users) > 0); } int vmci_host_users(void) { return atomic_read(&vmci_host_active_users); } /* * Called on open of /dev/vmci. */ static int vmci_host_open(struct inode *inode, struct file *filp) { struct vmci_host_dev *vmci_host_dev; vmci_host_dev = kzalloc(sizeof(struct vmci_host_dev), GFP_KERNEL); if (vmci_host_dev == NULL) return -ENOMEM; vmci_host_dev->ct_type = VMCIOBJ_NOT_SET; mutex_init(&vmci_host_dev->lock); filp->private_data = vmci_host_dev; return 0; } /* * Called on close of /dev/vmci, most often when the process * exits. */ static int vmci_host_close(struct inode *inode, struct file *filp) { struct vmci_host_dev *vmci_host_dev = filp->private_data; if (vmci_host_dev->ct_type == VMCIOBJ_CONTEXT) { vmci_ctx_destroy(vmci_host_dev->context); vmci_host_dev->context = NULL; /* * The number of active contexts is used to track whether any * VMX'en are using the host personality. It is incremented when * a context is created through the IOCTL_VMCI_INIT_CONTEXT * ioctl. */ atomic_dec(&vmci_host_active_users); } vmci_host_dev->ct_type = VMCIOBJ_NOT_SET; kfree(vmci_host_dev); filp->private_data = NULL; return 0; } /* * This is used to wake up the VMX when a VMCI call arrives, or * to wake up select() or poll() at the next clock tick. */ static __poll_t vmci_host_poll(struct file *filp, poll_table *wait) { struct vmci_host_dev *vmci_host_dev = filp->private_data; struct vmci_ctx *context; __poll_t mask = 0; if (vmci_host_dev->ct_type == VMCIOBJ_CONTEXT) { /* * Read context only if ct_type == VMCIOBJ_CONTEXT to make * sure that context is initialized */ context = vmci_host_dev->context; /* Check for VMCI calls to this VM context. */ if (wait) poll_wait(filp, &context->host_context.wait_queue, wait); spin_lock(&context->lock); if (context->pending_datagrams > 0 || vmci_handle_arr_get_size( context->pending_doorbell_array) > 0) { mask = EPOLLIN; } spin_unlock(&context->lock); } return mask; } /* * Copies the handles of a handle array into a user buffer, and * returns the new length in userBufferSize. If the copy to the * user buffer fails, the functions still returns VMCI_SUCCESS, * but retval != 0. */ static int drv_cp_harray_to_user(void __user *user_buf_uva, u64 *user_buf_size, struct vmci_handle_arr *handle_array, int *retval) { u32 array_size = 0; struct vmci_handle *handles; if (handle_array) array_size = vmci_handle_arr_get_size(handle_array); if (array_size * sizeof(*handles) > *user_buf_size) return VMCI_ERROR_MORE_DATA; *user_buf_size = array_size * sizeof(*handles); if (*user_buf_size) *retval = copy_to_user(user_buf_uva, vmci_handle_arr_get_handles (handle_array), *user_buf_size); return VMCI_SUCCESS; } /* * Sets up a given context for notify to work. Maps the notify * boolean in user VA into kernel space. */ static int vmci_host_setup_notify(struct vmci_ctx *context, unsigned long uva) { int retval; if (context->notify_page) { pr_devel("%s: Notify mechanism is already set up\n", __func__); return VMCI_ERROR_DUPLICATE_ENTRY; } /* * We are using 'bool' internally, but let's make sure we explicit * about the size. */ BUILD_BUG_ON(sizeof(bool) != sizeof(u8)); /* * Lock physical page backing a given user VA. */ retval = get_user_pages_fast(uva, 1, FOLL_WRITE, &context->notify_page); if (retval != 1) { context->notify_page = NULL; return VMCI_ERROR_GENERIC; } if (context->notify_page == NULL) return VMCI_ERROR_UNAVAILABLE; /* * Map the locked page and set up notify pointer. */ context->notify = kmap(context->notify_page) + (uva & (PAGE_SIZE - 1)); vmci_ctx_check_signal_notify(context); return VMCI_SUCCESS; } static int vmci_host_get_version(struct vmci_host_dev *vmci_host_dev, unsigned int cmd, void __user *uptr) { if (cmd == IOCTL_VMCI_VERSION2) { int __user *vptr = uptr; if (get_user(vmci_host_dev->user_version, vptr)) return -EFAULT; } /* * The basic logic here is: * * If the user sends in a version of 0 tell it our version. * If the user didn't send in a version, tell it our version. * If the user sent in an old version, tell it -its- version. * If the user sent in an newer version, tell it our version. * * The rationale behind telling the caller its version is that * Workstation 6.5 required that VMX and VMCI kernel module were * version sync'd. All new VMX users will be programmed to * handle the VMCI kernel module version. */ if (vmci_host_dev->user_version > 0 && vmci_host_dev->user_version < VMCI_VERSION_HOSTQP) { return vmci_host_dev->user_version; } return VMCI_VERSION; } #define vmci_ioctl_err(fmt, ...) \ pr_devel("%s: " fmt, ioctl_name, ##__VA_ARGS__) static int vmci_host_do_init_context(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_init_blk init_block; const struct cred *cred; int retval; if (copy_from_user(&init_block, uptr, sizeof(init_block))) { vmci_ioctl_err("error reading init block\n"); return -EFAULT; } mutex_lock(&vmci_host_dev->lock); if (vmci_host_dev->ct_type != VMCIOBJ_NOT_SET) { vmci_ioctl_err("received VMCI init on initialized handle\n"); retval = -EINVAL; goto out; } if (init_block.flags & ~VMCI_PRIVILEGE_FLAG_RESTRICTED) { vmci_ioctl_err("unsupported VMCI restriction flag\n"); retval = -EINVAL; goto out; } cred = get_current_cred(); vmci_host_dev->context = vmci_ctx_create(init_block.cid, init_block.flags, 0, vmci_host_dev->user_version, cred); put_cred(cred); if (IS_ERR(vmci_host_dev->context)) { retval = PTR_ERR(vmci_host_dev->context); vmci_ioctl_err("error initializing context\n"); goto out; } /* * Copy cid to userlevel, we do this to allow the VMX * to enforce its policy on cid generation. */ init_block.cid = vmci_ctx_get_id(vmci_host_dev->context); if (copy_to_user(uptr, &init_block, sizeof(init_block))) { vmci_ctx_destroy(vmci_host_dev->context); vmci_host_dev->context = NULL; vmci_ioctl_err("error writing init block\n"); retval = -EFAULT; goto out; } vmci_host_dev->ct_type = VMCIOBJ_CONTEXT; atomic_inc(&vmci_host_active_users); vmci_call_vsock_callback(true); retval = 0; out: mutex_unlock(&vmci_host_dev->lock); return retval; } static int vmci_host_do_send_datagram(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_datagram_snd_rcv_info send_info; struct vmci_datagram *dg = NULL; u32 cid; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&send_info, uptr, sizeof(send_info))) return -EFAULT; if (send_info.len > VMCI_MAX_DG_SIZE) { vmci_ioctl_err("datagram is too big (size=%d)\n", send_info.len); return -EINVAL; } if (send_info.len < sizeof(*dg)) { vmci_ioctl_err("datagram is too small (size=%d)\n", send_info.len); return -EINVAL; } dg = memdup_user((void __user *)(uintptr_t)send_info.addr, send_info.len); if (IS_ERR(dg)) { vmci_ioctl_err( "cannot allocate memory to dispatch datagram\n"); return PTR_ERR(dg); } if (VMCI_DG_SIZE(dg) != send_info.len) { vmci_ioctl_err("datagram size mismatch\n"); kfree(dg); return -EINVAL; } pr_devel("Datagram dst (handle=0x%x:0x%x) src (handle=0x%x:0x%x), payload (size=%llu bytes)\n", dg->dst.context, dg->dst.resource, dg->src.context, dg->src.resource, (unsigned long long)dg->payload_size); /* Get source context id. */ cid = vmci_ctx_get_id(vmci_host_dev->context); send_info.result = vmci_datagram_dispatch(cid, dg, true); kfree(dg); return copy_to_user(uptr, &send_info, sizeof(send_info)) ? -EFAULT : 0; } static int vmci_host_do_receive_datagram(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_datagram_snd_rcv_info recv_info; struct vmci_datagram *dg = NULL; int retval; size_t size; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&recv_info, uptr, sizeof(recv_info))) return -EFAULT; size = recv_info.len; recv_info.result = vmci_ctx_dequeue_datagram(vmci_host_dev->context, &size, &dg); if (recv_info.result >= VMCI_SUCCESS) { void __user *ubuf = (void __user *)(uintptr_t)recv_info.addr; retval = copy_to_user(ubuf, dg, VMCI_DG_SIZE(dg)); kfree(dg); if (retval != 0) return -EFAULT; } return copy_to_user(uptr, &recv_info, sizeof(recv_info)) ? -EFAULT : 0; } static int vmci_host_do_alloc_queuepair(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_handle handle; int vmci_status; int __user *retptr; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (vmci_host_dev->user_version < VMCI_VERSION_NOVMVM) { struct vmci_qp_alloc_info_vmvm alloc_info; struct vmci_qp_alloc_info_vmvm __user *info = uptr; if (copy_from_user(&alloc_info, uptr, sizeof(alloc_info))) return -EFAULT; handle = alloc_info.handle; retptr = &info->result; vmci_status = vmci_qp_broker_alloc(alloc_info.handle, alloc_info.peer, alloc_info.flags, VMCI_NO_PRIVILEGE_FLAGS, alloc_info.produce_size, alloc_info.consume_size, NULL, vmci_host_dev->context); if (vmci_status == VMCI_SUCCESS) vmci_status = VMCI_SUCCESS_QUEUEPAIR_CREATE; } else { struct vmci_qp_alloc_info alloc_info; struct vmci_qp_alloc_info __user *info = uptr; struct vmci_qp_page_store page_store; if (copy_from_user(&alloc_info, uptr, sizeof(alloc_info))) return -EFAULT; handle = alloc_info.handle; retptr = &info->result; page_store.pages = alloc_info.ppn_va; page_store.len = alloc_info.num_ppns; vmci_status = vmci_qp_broker_alloc(alloc_info.handle, alloc_info.peer, alloc_info.flags, VMCI_NO_PRIVILEGE_FLAGS, alloc_info.produce_size, alloc_info.consume_size, &page_store, vmci_host_dev->context); } if (put_user(vmci_status, retptr)) { if (vmci_status >= VMCI_SUCCESS) { vmci_status = vmci_qp_broker_detach(handle, vmci_host_dev->context); } return -EFAULT; } return 0; } static int vmci_host_do_queuepair_setva(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_qp_set_va_info set_va_info; struct vmci_qp_set_va_info __user *info = uptr; s32 result; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (vmci_host_dev->user_version < VMCI_VERSION_NOVMVM) { vmci_ioctl_err("is not allowed\n"); return -EINVAL; } if (copy_from_user(&set_va_info, uptr, sizeof(set_va_info))) return -EFAULT; if (set_va_info.va) { /* * VMX is passing down a new VA for the queue * pair mapping. */ result = vmci_qp_broker_map(set_va_info.handle, vmci_host_dev->context, set_va_info.va); } else { /* * The queue pair is about to be unmapped by * the VMX. */ result = vmci_qp_broker_unmap(set_va_info.handle, vmci_host_dev->context, 0); } return put_user(result, &info->result) ? -EFAULT : 0; } static int vmci_host_do_queuepair_setpf(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_qp_page_file_info page_file_info; struct vmci_qp_page_file_info __user *info = uptr; s32 result; if (vmci_host_dev->user_version < VMCI_VERSION_HOSTQP || vmci_host_dev->user_version >= VMCI_VERSION_NOVMVM) { vmci_ioctl_err("not supported on this VMX (version=%d)\n", vmci_host_dev->user_version); return -EINVAL; } if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&page_file_info, uptr, sizeof(*info))) return -EFAULT; /* * Communicate success pre-emptively to the caller. Note that the * basic premise is that it is incumbent upon the caller not to look at * the info.result field until after the ioctl() returns. And then, * only if the ioctl() result indicates no error. We send up the * SUCCESS status before calling SetPageStore() store because failing * to copy up the result code means unwinding the SetPageStore(). * * It turns out the logic to unwind a SetPageStore() opens a can of * worms. For example, if a host had created the queue_pair and a * guest attaches and SetPageStore() is successful but writing success * fails, then ... the host has to be stopped from writing (anymore) * data into the queue_pair. That means an additional test in the * VMCI_Enqueue() code path. Ugh. */ if (put_user(VMCI_SUCCESS, &info->result)) { /* * In this case, we can't write a result field of the * caller's info block. So, we don't even try to * SetPageStore(). */ return -EFAULT; } result = vmci_qp_broker_set_page_store(page_file_info.handle, page_file_info.produce_va, page_file_info.consume_va, vmci_host_dev->context); if (result < VMCI_SUCCESS) { if (put_user(result, &info->result)) { /* * Note that in this case the SetPageStore() * call failed but we were unable to * communicate that to the caller (because the * copy_to_user() call failed). So, if we * simply return an error (in this case * -EFAULT) then the caller will know that the * SetPageStore failed even though we couldn't * put the result code in the result field and * indicate exactly why it failed. * * That says nothing about the issue where we * were once able to write to the caller's info * memory and now can't. Something more * serious is probably going on than the fact * that SetPageStore() didn't work. */ return -EFAULT; } } return 0; } static int vmci_host_do_qp_detach(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_qp_dtch_info detach_info; struct vmci_qp_dtch_info __user *info = uptr; s32 result; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&detach_info, uptr, sizeof(detach_info))) return -EFAULT; result = vmci_qp_broker_detach(detach_info.handle, vmci_host_dev->context); if (result == VMCI_SUCCESS && vmci_host_dev->user_version < VMCI_VERSION_NOVMVM) { result = VMCI_SUCCESS_LAST_DETACH; } return put_user(result, &info->result) ? -EFAULT : 0; } static int vmci_host_do_ctx_add_notify(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_ctx_info ar_info; struct vmci_ctx_info __user *info = uptr; s32 result; u32 cid; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&ar_info, uptr, sizeof(ar_info))) return -EFAULT; cid = vmci_ctx_get_id(vmci_host_dev->context); result = vmci_ctx_add_notification(cid, ar_info.remote_cid); return put_user(result, &info->result) ? -EFAULT : 0; } static int vmci_host_do_ctx_remove_notify(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_ctx_info ar_info; struct vmci_ctx_info __user *info = uptr; u32 cid; int result; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&ar_info, uptr, sizeof(ar_info))) return -EFAULT; cid = vmci_ctx_get_id(vmci_host_dev->context); result = vmci_ctx_remove_notification(cid, ar_info.remote_cid); return put_user(result, &info->result) ? -EFAULT : 0; } static int vmci_host_do_ctx_get_cpt_state(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_ctx_chkpt_buf_info get_info; u32 cid; void *cpt_buf; int retval; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&get_info, uptr, sizeof(get_info))) return -EFAULT; cid = vmci_ctx_get_id(vmci_host_dev->context); get_info.result = vmci_ctx_get_chkpt_state(cid, get_info.cpt_type, &get_info.buf_size, &cpt_buf); if (get_info.result == VMCI_SUCCESS && get_info.buf_size) { void __user *ubuf = (void __user *)(uintptr_t)get_info.cpt_buf; retval = copy_to_user(ubuf, cpt_buf, get_info.buf_size); kfree(cpt_buf); if (retval) return -EFAULT; } return copy_to_user(uptr, &get_info, sizeof(get_info)) ? -EFAULT : 0; } static int vmci_host_do_ctx_set_cpt_state(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_ctx_chkpt_buf_info set_info; u32 cid; void *cpt_buf; int retval; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&set_info, uptr, sizeof(set_info))) return -EFAULT; cpt_buf = memdup_user((void __user *)(uintptr_t)set_info.cpt_buf, set_info.buf_size); if (IS_ERR(cpt_buf)) return PTR_ERR(cpt_buf); cid = vmci_ctx_get_id(vmci_host_dev->context); set_info.result = vmci_ctx_set_chkpt_state(cid, set_info.cpt_type, set_info.buf_size, cpt_buf); retval = copy_to_user(uptr, &set_info, sizeof(set_info)) ? -EFAULT : 0; kfree(cpt_buf); return retval; } static int vmci_host_do_get_context_id(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { u32 __user *u32ptr = uptr; return put_user(VMCI_HOST_CONTEXT_ID, u32ptr) ? -EFAULT : 0; } static int vmci_host_do_set_notify(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_set_notify_info notify_info; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(¬ify_info, uptr, sizeof(notify_info))) return -EFAULT; if (notify_info.notify_uva) { notify_info.result = vmci_host_setup_notify(vmci_host_dev->context, notify_info.notify_uva); } else { vmci_ctx_unset_notify(vmci_host_dev->context); notify_info.result = VMCI_SUCCESS; } return copy_to_user(uptr, ¬ify_info, sizeof(notify_info)) ? -EFAULT : 0; } static int vmci_host_do_notify_resource(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_dbell_notify_resource_info info; u32 cid; if (vmci_host_dev->user_version < VMCI_VERSION_NOTIFY) { vmci_ioctl_err("invalid for current VMX versions\n"); return -EINVAL; } if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&info, uptr, sizeof(info))) return -EFAULT; cid = vmci_ctx_get_id(vmci_host_dev->context); switch (info.action) { case VMCI_NOTIFY_RESOURCE_ACTION_NOTIFY: if (info.resource == VMCI_NOTIFY_RESOURCE_DOOR_BELL) { u32 flags = VMCI_NO_PRIVILEGE_FLAGS; info.result = vmci_ctx_notify_dbell(cid, info.handle, flags); } else { info.result = VMCI_ERROR_UNAVAILABLE; } break; case VMCI_NOTIFY_RESOURCE_ACTION_CREATE: info.result = vmci_ctx_dbell_create(cid, info.handle); break; case VMCI_NOTIFY_RESOURCE_ACTION_DESTROY: info.result = vmci_ctx_dbell_destroy(cid, info.handle); break; default: vmci_ioctl_err("got unknown action (action=%d)\n", info.action); info.result = VMCI_ERROR_INVALID_ARGS; } return copy_to_user(uptr, &info, sizeof(info)) ? -EFAULT : 0; } static int vmci_host_do_recv_notifications(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_ctx_notify_recv_info info; struct vmci_handle_arr *db_handle_array; struct vmci_handle_arr *qp_handle_array; void __user *ubuf; u32 cid; int retval = 0; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (vmci_host_dev->user_version < VMCI_VERSION_NOTIFY) { vmci_ioctl_err("not supported for the current vmx version\n"); return -EINVAL; } if (copy_from_user(&info, uptr, sizeof(info))) return -EFAULT; if ((info.db_handle_buf_size && !info.db_handle_buf_uva) || (info.qp_handle_buf_size && !info.qp_handle_buf_uva)) { return -EINVAL; } cid = vmci_ctx_get_id(vmci_host_dev->context); info.result = vmci_ctx_rcv_notifications_get(cid, &db_handle_array, &qp_handle_array); if (info.result != VMCI_SUCCESS) return copy_to_user(uptr, &info, sizeof(info)) ? -EFAULT : 0; ubuf = (void __user *)(uintptr_t)info.db_handle_buf_uva; info.result = drv_cp_harray_to_user(ubuf, &info.db_handle_buf_size, db_handle_array, &retval); if (info.result == VMCI_SUCCESS && !retval) { ubuf = (void __user *)(uintptr_t)info.qp_handle_buf_uva; info.result = drv_cp_harray_to_user(ubuf, &info.qp_handle_buf_size, qp_handle_array, &retval); } if (!retval && copy_to_user(uptr, &info, sizeof(info))) retval = -EFAULT; vmci_ctx_rcv_notifications_release(cid, db_handle_array, qp_handle_array, info.result == VMCI_SUCCESS && !retval); return retval; } static long vmci_host_unlocked_ioctl(struct file *filp, unsigned int iocmd, unsigned long ioarg) { #define VMCI_DO_IOCTL(ioctl_name, ioctl_fn) do { \ char *name = "IOCTL_VMCI_" # ioctl_name; \ return vmci_host_do_ ## ioctl_fn( \ vmci_host_dev, name, uptr); \ } while (0) struct vmci_host_dev *vmci_host_dev = filp->private_data; void __user *uptr = (void __user *)ioarg; switch (iocmd) { case IOCTL_VMCI_INIT_CONTEXT: VMCI_DO_IOCTL(INIT_CONTEXT, init_context); case IOCTL_VMCI_DATAGRAM_SEND: VMCI_DO_IOCTL(DATAGRAM_SEND, send_datagram); case IOCTL_VMCI_DATAGRAM_RECEIVE: VMCI_DO_IOCTL(DATAGRAM_RECEIVE, receive_datagram); case IOCTL_VMCI_QUEUEPAIR_ALLOC: VMCI_DO_IOCTL(QUEUEPAIR_ALLOC, alloc_queuepair); case IOCTL_VMCI_QUEUEPAIR_SETVA: VMCI_DO_IOCTL(QUEUEPAIR_SETVA, queuepair_setva); case IOCTL_VMCI_QUEUEPAIR_SETPAGEFILE: VMCI_DO_IOCTL(QUEUEPAIR_SETPAGEFILE, queuepair_setpf); case IOCTL_VMCI_QUEUEPAIR_DETACH: VMCI_DO_IOCTL(QUEUEPAIR_DETACH, qp_detach); case IOCTL_VMCI_CTX_ADD_NOTIFICATION: VMCI_DO_IOCTL(CTX_ADD_NOTIFICATION, ctx_add_notify); case IOCTL_VMCI_CTX_REMOVE_NOTIFICATION: VMCI_DO_IOCTL(CTX_REMOVE_NOTIFICATION, ctx_remove_notify); case IOCTL_VMCI_CTX_GET_CPT_STATE: VMCI_DO_IOCTL(CTX_GET_CPT_STATE, ctx_get_cpt_state); case IOCTL_VMCI_CTX_SET_CPT_STATE: VMCI_DO_IOCTL(CTX_SET_CPT_STATE, ctx_set_cpt_state); case IOCTL_VMCI_GET_CONTEXT_ID: VMCI_DO_IOCTL(GET_CONTEXT_ID, get_context_id); case IOCTL_VMCI_SET_NOTIFY: VMCI_DO_IOCTL(SET_NOTIFY, set_notify); case IOCTL_VMCI_NOTIFY_RESOURCE: VMCI_DO_IOCTL(NOTIFY_RESOURCE, notify_resource); case IOCTL_VMCI_NOTIFICATIONS_RECEIVE: VMCI_DO_IOCTL(NOTIFICATIONS_RECEIVE, recv_notifications); case IOCTL_VMCI_VERSION: case IOCTL_VMCI_VERSION2: return vmci_host_get_version(vmci_host_dev, iocmd, uptr); default: pr_devel("%s: Unknown ioctl (iocmd=%d)\n", __func__, iocmd); return -EINVAL; } #undef VMCI_DO_IOCTL } static const struct file_operations vmuser_fops = { .owner = THIS_MODULE, .open = vmci_host_open, .release = vmci_host_close, .poll = vmci_host_poll, .unlocked_ioctl = vmci_host_unlocked_ioctl, .compat_ioctl = compat_ptr_ioctl, }; static struct miscdevice vmci_host_miscdev = { .name = "vmci", .minor = MISC_DYNAMIC_MINOR, .fops = &vmuser_fops, }; int __init vmci_host_init(void) { int error; host_context = vmci_ctx_create(VMCI_HOST_CONTEXT_ID, VMCI_DEFAULT_PROC_PRIVILEGE_FLAGS, -1, VMCI_VERSION, NULL); if (IS_ERR(host_context)) { error = PTR_ERR(host_context); pr_warn("Failed to initialize VMCIContext (error%d)\n", error); return error; } error = misc_register(&vmci_host_miscdev); if (error) { pr_warn("Module registration error (name=%s, major=%d, minor=%d, err=%d)\n", vmci_host_miscdev.name, MISC_MAJOR, vmci_host_miscdev.minor, error); pr_warn("Unable to initialize host personality\n"); vmci_ctx_destroy(host_context); return error; } pr_info("VMCI host device registered (name=%s, major=%d, minor=%d)\n", vmci_host_miscdev.name, MISC_MAJOR, vmci_host_miscdev.minor); vmci_host_device_initialized = true; return 0; } void __exit vmci_host_exit(void) { vmci_host_device_initialized = false; misc_deregister(&vmci_host_miscdev); vmci_ctx_destroy(host_context); vmci_qp_broker_exit(); pr_debug("VMCI host driver module unloaded\n"); } |
| 432 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Access to user system call parameters and results * * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved. * * See asm-generic/syscall.h for descriptions of what we must do here. */ #ifndef _ASM_X86_SYSCALL_H #define _ASM_X86_SYSCALL_H #include <uapi/linux/audit.h> #include <linux/sched.h> #include <linux/err.h> #include <asm/thread_info.h> /* for TS_COMPAT */ #include <asm/unistd.h> /* This is used purely for kernel/trace/trace_syscalls.c */ typedef long (*sys_call_ptr_t)(const struct pt_regs *); extern const sys_call_ptr_t sys_call_table[]; /* * These may not exist, but still put the prototypes in so we * can use IS_ENABLED(). */ extern long ia32_sys_call(const struct pt_regs *, unsigned int nr); extern long x32_sys_call(const struct pt_regs *, unsigned int nr); extern long x64_sys_call(const struct pt_regs *, unsigned int nr); /* * Only the low 32 bits of orig_ax are meaningful, so we return int. * This importantly ignores the high bits on 64-bit, so comparisons * sign-extend the low 32 bits. */ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { return regs->orig_ax; } static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { regs->ax = regs->orig_ax; } static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { unsigned long error = regs->ax; #ifdef CONFIG_IA32_EMULATION /* * TS_COMPAT is set for 32-bit syscall entries and then * remains set until we return to user mode. */ if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED)) /* * Sign-extend the value so (int)-EFOO becomes (long)-EFOO * and will match correctly in comparisons. */ error = (long) (int) error; #endif return IS_ERR_VALUE(error) ? error : 0; } static inline long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs) { return regs->ax; } static inline void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, int error, long val) { regs->ax = (long) error ?: val; } #ifdef CONFIG_X86_32 static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args) { args[0] = regs->bx; args[1] = regs->cx; args[2] = regs->dx; args[3] = regs->si; args[4] = regs->di; args[5] = regs->bp; } static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_I386; } #else /* CONFIG_X86_64 */ static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args) { # ifdef CONFIG_IA32_EMULATION if (task->thread_info.status & TS_COMPAT) { *args++ = regs->bx; *args++ = regs->cx; *args++ = regs->dx; *args++ = regs->si; *args++ = regs->di; *args = regs->bp; } else # endif { *args++ = regs->di; *args++ = regs->si; *args++ = regs->dx; *args++ = regs->r10; *args++ = regs->r8; *args = regs->r9; } } static inline int syscall_get_arch(struct task_struct *task) { /* x32 tasks should be considered AUDIT_ARCH_X86_64. */ return (IS_ENABLED(CONFIG_IA32_EMULATION) && task->thread_info.status & TS_COMPAT) ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } bool do_syscall_64(struct pt_regs *regs, int nr); void do_int80_emulation(struct pt_regs *regs); #endif /* CONFIG_X86_32 */ void do_int80_syscall_32(struct pt_regs *regs); bool do_fast_syscall_32(struct pt_regs *regs); bool do_SYSENTER_32(struct pt_regs *regs); #endif /* _ASM_X86_SYSCALL_H */ |
| 56 86 86 73 72 26 50 50 35 29 17 17 11 1 11 18 34 34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Shared crypto simd helpers * * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> * Copyright (c) 2016 Herbert Xu <herbert@gondor.apana.org.au> * Copyright (c) 2019 Google LLC * * Based on aesni-intel_glue.c by: * Copyright (C) 2008, Intel Corp. * Author: Huang Ying <ying.huang@intel.com> */ /* * Shared crypto SIMD helpers. These functions dynamically create and register * an skcipher or AEAD algorithm that wraps another, internal algorithm. The * wrapper ensures that the internal algorithm is only executed in a context * where SIMD instructions are usable, i.e. where may_use_simd() returns true. * If SIMD is already usable, the wrapper directly calls the internal algorithm. * Otherwise it defers execution to a workqueue via cryptd. * * This is an alternative to the internal algorithm implementing a fallback for * the !may_use_simd() case itself. * * Note that the wrapper algorithm is asynchronous, i.e. it has the * CRYPTO_ALG_ASYNC flag set. Therefore it won't be found by users who * explicitly allocate a synchronous algorithm. */ #include <crypto/cryptd.h> #include <crypto/internal/aead.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/preempt.h> #include <asm/simd.h> /* skcipher support */ struct simd_skcipher_alg { const char *ialg_name; struct skcipher_alg alg; }; struct simd_skcipher_ctx { struct cryptd_skcipher *cryptd_tfm; }; static int simd_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int key_len) { struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct crypto_skcipher *child = &ctx->cryptd_tfm->base; crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); return crypto_skcipher_setkey(child, key, key_len); } static int simd_skcipher_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_request *subreq; struct crypto_skcipher *child; subreq = skcipher_request_ctx(req); *subreq = *req; if (!crypto_simd_usable() || (in_atomic() && cryptd_skcipher_queued(ctx->cryptd_tfm))) child = &ctx->cryptd_tfm->base; else child = cryptd_skcipher_child(ctx->cryptd_tfm); skcipher_request_set_tfm(subreq, child); return crypto_skcipher_encrypt(subreq); } static int simd_skcipher_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_request *subreq; struct crypto_skcipher *child; subreq = skcipher_request_ctx(req); *subreq = *req; if (!crypto_simd_usable() || (in_atomic() && cryptd_skcipher_queued(ctx->cryptd_tfm))) child = &ctx->cryptd_tfm->base; else child = cryptd_skcipher_child(ctx->cryptd_tfm); skcipher_request_set_tfm(subreq, child); return crypto_skcipher_decrypt(subreq); } static void simd_skcipher_exit(struct crypto_skcipher *tfm) { struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); cryptd_free_skcipher(ctx->cryptd_tfm); } static int simd_skcipher_init(struct crypto_skcipher *tfm) { struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct cryptd_skcipher *cryptd_tfm; struct simd_skcipher_alg *salg; struct skcipher_alg *alg; unsigned reqsize; alg = crypto_skcipher_alg(tfm); salg = container_of(alg, struct simd_skcipher_alg, alg); cryptd_tfm = cryptd_alloc_skcipher(salg->ialg_name, CRYPTO_ALG_INTERNAL, CRYPTO_ALG_INTERNAL); if (IS_ERR(cryptd_tfm)) return PTR_ERR(cryptd_tfm); ctx->cryptd_tfm = cryptd_tfm; reqsize = crypto_skcipher_reqsize(cryptd_skcipher_child(cryptd_tfm)); reqsize = max(reqsize, crypto_skcipher_reqsize(&cryptd_tfm->base)); reqsize += sizeof(struct skcipher_request); crypto_skcipher_set_reqsize(tfm, reqsize); return 0; } struct simd_skcipher_alg *simd_skcipher_create_compat(struct skcipher_alg *ialg, const char *algname, const char *drvname, const char *basename) { struct simd_skcipher_alg *salg; struct skcipher_alg *alg; int err; salg = kzalloc(sizeof(*salg), GFP_KERNEL); if (!salg) { salg = ERR_PTR(-ENOMEM); goto out; } salg->ialg_name = basename; alg = &salg->alg; err = -ENAMETOOLONG; if (snprintf(alg->base.cra_name, CRYPTO_MAX_ALG_NAME, "%s", algname) >= CRYPTO_MAX_ALG_NAME) goto out_free_salg; if (snprintf(alg->base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s", drvname) >= CRYPTO_MAX_ALG_NAME) goto out_free_salg; alg->base.cra_flags = CRYPTO_ALG_ASYNC | (ialg->base.cra_flags & CRYPTO_ALG_INHERITED_FLAGS); alg->base.cra_priority = ialg->base.cra_priority; alg->base.cra_blocksize = ialg->base.cra_blocksize; alg->base.cra_alignmask = ialg->base.cra_alignmask; alg->base.cra_module = ialg->base.cra_module; alg->base.cra_ctxsize = sizeof(struct simd_skcipher_ctx); alg->ivsize = ialg->ivsize; alg->chunksize = ialg->chunksize; alg->min_keysize = ialg->min_keysize; alg->max_keysize = ialg->max_keysize; alg->init = simd_skcipher_init; alg->exit = simd_skcipher_exit; alg->setkey = simd_skcipher_setkey; alg->encrypt = simd_skcipher_encrypt; alg->decrypt = simd_skcipher_decrypt; err = crypto_register_skcipher(alg); if (err) goto out_free_salg; out: return salg; out_free_salg: kfree(salg); salg = ERR_PTR(err); goto out; } EXPORT_SYMBOL_GPL(simd_skcipher_create_compat); void simd_skcipher_free(struct simd_skcipher_alg *salg) { crypto_unregister_skcipher(&salg->alg); kfree(salg); } EXPORT_SYMBOL_GPL(simd_skcipher_free); int simd_register_skciphers_compat(struct skcipher_alg *algs, int count, struct simd_skcipher_alg **simd_algs) { int err; int i; const char *algname; const char *drvname; const char *basename; struct simd_skcipher_alg *simd; err = crypto_register_skciphers(algs, count); if (err) return err; for (i = 0; i < count; i++) { WARN_ON(strncmp(algs[i].base.cra_name, "__", 2)); WARN_ON(strncmp(algs[i].base.cra_driver_name, "__", 2)); algname = algs[i].base.cra_name + 2; drvname = algs[i].base.cra_driver_name + 2; basename = algs[i].base.cra_driver_name; simd = simd_skcipher_create_compat(algs + i, algname, drvname, basename); err = PTR_ERR(simd); if (IS_ERR(simd)) goto err_unregister; simd_algs[i] = simd; } return 0; err_unregister: simd_unregister_skciphers(algs, count, simd_algs); return err; } EXPORT_SYMBOL_GPL(simd_register_skciphers_compat); void simd_unregister_skciphers(struct skcipher_alg *algs, int count, struct simd_skcipher_alg **simd_algs) { int i; crypto_unregister_skciphers(algs, count); for (i = 0; i < count; i++) { if (simd_algs[i]) { simd_skcipher_free(simd_algs[i]); simd_algs[i] = NULL; } } } EXPORT_SYMBOL_GPL(simd_unregister_skciphers); /* AEAD support */ struct simd_aead_alg { const char *ialg_name; struct aead_alg alg; }; struct simd_aead_ctx { struct cryptd_aead *cryptd_tfm; }; static int simd_aead_setkey(struct crypto_aead *tfm, const u8 *key, unsigned int key_len) { struct simd_aead_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_aead *child = &ctx->cryptd_tfm->base; crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_aead_set_flags(child, crypto_aead_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); return crypto_aead_setkey(child, key, key_len); } static int simd_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { struct simd_aead_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_aead *child = &ctx->cryptd_tfm->base; return crypto_aead_setauthsize(child, authsize); } static int simd_aead_encrypt(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct simd_aead_ctx *ctx = crypto_aead_ctx(tfm); struct aead_request *subreq; struct crypto_aead *child; subreq = aead_request_ctx(req); *subreq = *req; if (!crypto_simd_usable() || (in_atomic() && cryptd_aead_queued(ctx->cryptd_tfm))) child = &ctx->cryptd_tfm->base; else child = cryptd_aead_child(ctx->cryptd_tfm); aead_request_set_tfm(subreq, child); return crypto_aead_encrypt(subreq); } static int simd_aead_decrypt(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct simd_aead_ctx *ctx = crypto_aead_ctx(tfm); struct aead_request *subreq; struct crypto_aead *child; subreq = aead_request_ctx(req); *subreq = *req; if (!crypto_simd_usable() || (in_atomic() && cryptd_aead_queued(ctx->cryptd_tfm))) child = &ctx->cryptd_tfm->base; else child = cryptd_aead_child(ctx->cryptd_tfm); aead_request_set_tfm(subreq, child); return crypto_aead_decrypt(subreq); } static void simd_aead_exit(struct crypto_aead *tfm) { struct simd_aead_ctx *ctx = crypto_aead_ctx(tfm); cryptd_free_aead(ctx->cryptd_tfm); } static int simd_aead_init(struct crypto_aead *tfm) { struct simd_aead_ctx *ctx = crypto_aead_ctx(tfm); struct cryptd_aead *cryptd_tfm; struct simd_aead_alg *salg; struct aead_alg *alg; unsigned reqsize; alg = crypto_aead_alg(tfm); salg = container_of(alg, struct simd_aead_alg, alg); cryptd_tfm = cryptd_alloc_aead(salg->ialg_name, CRYPTO_ALG_INTERNAL, CRYPTO_ALG_INTERNAL); if (IS_ERR(cryptd_tfm)) return PTR_ERR(cryptd_tfm); ctx->cryptd_tfm = cryptd_tfm; reqsize = crypto_aead_reqsize(cryptd_aead_child(cryptd_tfm)); reqsize = max(reqsize, crypto_aead_reqsize(&cryptd_tfm->base)); reqsize += sizeof(struct aead_request); crypto_aead_set_reqsize(tfm, reqsize); return 0; } static struct simd_aead_alg *simd_aead_create_compat(struct aead_alg *ialg, const char *algname, const char *drvname, const char *basename) { struct simd_aead_alg *salg; struct aead_alg *alg; int err; salg = kzalloc(sizeof(*salg), GFP_KERNEL); if (!salg) { salg = ERR_PTR(-ENOMEM); goto out; } salg->ialg_name = basename; alg = &salg->alg; err = -ENAMETOOLONG; if (snprintf(alg->base.cra_name, CRYPTO_MAX_ALG_NAME, "%s", algname) >= CRYPTO_MAX_ALG_NAME) goto out_free_salg; if (snprintf(alg->base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s", drvname) >= CRYPTO_MAX_ALG_NAME) goto out_free_salg; alg->base.cra_flags = CRYPTO_ALG_ASYNC | (ialg->base.cra_flags & CRYPTO_ALG_INHERITED_FLAGS); alg->base.cra_priority = ialg->base.cra_priority; alg->base.cra_blocksize = ialg->base.cra_blocksize; alg->base.cra_alignmask = ialg->base.cra_alignmask; alg->base.cra_module = ialg->base.cra_module; alg->base.cra_ctxsize = sizeof(struct simd_aead_ctx); alg->ivsize = ialg->ivsize; alg->maxauthsize = ialg->maxauthsize; alg->chunksize = ialg->chunksize; alg->init = simd_aead_init; alg->exit = simd_aead_exit; alg->setkey = simd_aead_setkey; alg->setauthsize = simd_aead_setauthsize; alg->encrypt = simd_aead_encrypt; alg->decrypt = simd_aead_decrypt; err = crypto_register_aead(alg); if (err) goto out_free_salg; out: return salg; out_free_salg: kfree(salg); salg = ERR_PTR(err); goto out; } static void simd_aead_free(struct simd_aead_alg *salg) { crypto_unregister_aead(&salg->alg); kfree(salg); } int simd_register_aeads_compat(struct aead_alg *algs, int count, struct simd_aead_alg **simd_algs) { int err; int i; const char *algname; const char *drvname; const char *basename; struct simd_aead_alg *simd; err = crypto_register_aeads(algs, count); if (err) return err; for (i = 0; i < count; i++) { WARN_ON(strncmp(algs[i].base.cra_name, "__", 2)); WARN_ON(strncmp(algs[i].base.cra_driver_name, "__", 2)); algname = algs[i].base.cra_name + 2; drvname = algs[i].base.cra_driver_name + 2; basename = algs[i].base.cra_driver_name; simd = simd_aead_create_compat(algs + i, algname, drvname, basename); err = PTR_ERR(simd); if (IS_ERR(simd)) goto err_unregister; simd_algs[i] = simd; } return 0; err_unregister: simd_unregister_aeads(algs, count, simd_algs); return err; } EXPORT_SYMBOL_GPL(simd_register_aeads_compat); void simd_unregister_aeads(struct aead_alg *algs, int count, struct simd_aead_alg **simd_algs) { int i; crypto_unregister_aeads(algs, count); for (i = 0; i < count; i++) { if (simd_algs[i]) { simd_aead_free(simd_algs[i]); simd_algs[i] = NULL; } } } EXPORT_SYMBOL_GPL(simd_unregister_aeads); MODULE_DESCRIPTION("Shared crypto SIMD helpers"); MODULE_LICENSE("GPL"); |
| 19 57 385 385 20 20 778 7 774 2 2 115 116 2 2 2 408 55 1 3 391 2 385 186 385 384 393 390 49 50 21 21 570 275 384 27 304 304 304 90 90 8 250 497 492 363 363 361 422 423 424 14 14 14 114 114 114 114 114 113 114 164 86 163 20 2 125 41 1 41 41 80 105 80 70 36 231 232 246 235 10 35 3 36 38 3 44 1 10 2 119 120 120 76 76 18 58 76 76 238 50 233 129 129 129 8 120 121 120 120 4 144 36 41 41 12 250 250 10 10 47 47 46 47 47 47 51 50 51 51 50 51 50 49 51 51 51 51 50 10 47 398 115 328 218 218 217 21 25 25 1 1 25 24 24 106 96 96 105 64 39 3 89 89 17 85 89 1 86 88 5 6 2 89 88 88 7 1 5 1 1 7 87 88 9 8 1 79 6 1 80 86 6 6 80 85 446 446 446 446 447 446 445 446 446 444 447 447 447 448 447 446 448 1 447 1 446 1 447 448 444 448 448 448 100 1 448 444 446 448 446 96 96 11 95 9 90 90 12 90 12 2 10 9 89 7 84 122 67 41 73 33 4 5 66 67 67 66 67 66 1 67 67 41 27 41 27 59 8 67 64 2 1 64 2 65 154 153 155 155 156 103 106 106 1 4 1 100 5 96 1 5 6 13 72 7 1 2 1 73 73 73 5 18 50 2 5 52 13 15 39 5 5 5 5 5 108 108 107 108 107 367 368 250 368 369 69 320 367 368 249 366 368 367 367 319 319 319 320 319 319 97 16 97 17 97 368 133 250 48 320 367 369 5 368 368 368 35 28 7 4 35 28 20 20 13 7 8 11 1 20 20 20 20 576 575 69 179 239 239 82 83 7 12 7 11 11 11 12 12 12 12 43 27 3 27 43 46 46 4 4 46 12 4 4 43 43 2 43 43 12 12 1 1 1 12 12 12 12 124 1 6 119 155 155 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/radix-tree.h> #include <linux/writeback.h> #include <linux/workqueue.h> #include <linux/kthread.h> #include <linux/slab.h> #include <linux/migrate.h> #include <linux/ratelimit.h> #include <linux/uuid.h> #include <linux/semaphore.h> #include <linux/error-injection.h> #include <linux/crc32c.h> #include <linux/sched/mm.h> #include <linux/unaligned.h> #include <crypto/hash.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" #include "bio.h" #include "print-tree.h" #include "locking.h" #include "tree-log.h" #include "free-space-cache.h" #include "free-space-tree.h" #include "dev-replace.h" #include "raid56.h" #include "sysfs.h" #include "qgroup.h" #include "compression.h" #include "tree-checker.h" #include "ref-verify.h" #include "block-group.h" #include "discard.h" #include "space-info.h" #include "zoned.h" #include "subpage.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" #include "uuid-tree.h" #include "relocation.h" #include "scrub.h" #include "super.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ BTRFS_SUPER_FLAG_ERROR |\ BTRFS_SUPER_FLAG_SEEDING |\ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info); static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info); static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) { if (fs_info->csum_shash) crypto_free_shash(fs_info->csum_shash); } /* * Compute the csum of a btree block and store the result to provided buffer. */ static void csum_tree_block(struct extent_buffer *buf, u8 *result) { struct btrfs_fs_info *fs_info = buf->fs_info; int num_pages; u32 first_page_part; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; int i; shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); if (buf->addr) { /* Pages are contiguous, handle them as a big one. */ kaddr = buf->addr; first_page_part = fs_info->nodesize; num_pages = 1; } else { kaddr = folio_address(buf->folios[0]); first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize); num_pages = num_extent_pages(buf); } crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, first_page_part - BTRFS_CSUM_SIZE); /* * Multiple single-page folios case would reach here. * * nodesize <= PAGE_SIZE and large folio all handled by above * crypto_shash_update() already. */ for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) { kaddr = folio_address(buf->folios[i]); crypto_shash_update(shash, kaddr, PAGE_SIZE); } memset(result, 0, BTRFS_CSUM_SIZE); crypto_shash_final(shash, result); } /* * we can't consider a given block up to date unless the transid of the * block matches the transid in the parent node's pointer. This is how we * detect blocks that either didn't get written at all or got written * in the wrong place. */ int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic) { if (!extent_buffer_uptodate(eb)) return 0; if (!parent_transid || btrfs_header_generation(eb) == parent_transid) return 1; if (atomic) return -EAGAIN; if (!extent_buffer_uptodate(eb) || btrfs_header_generation(eb) != parent_transid) { btrfs_err_rl(eb->fs_info, "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", eb->start, eb->read_mirror, parent_transid, btrfs_header_generation(eb)); clear_extent_buffer_uptodate(eb); return 0; } return 1; } static bool btrfs_supported_super_csum(u16 csum_type) { switch (csum_type) { case BTRFS_CSUM_TYPE_CRC32: case BTRFS_CSUM_TYPE_XXHASH: case BTRFS_CSUM_TYPE_SHA256: case BTRFS_CSUM_TYPE_BLAKE2: return true; default: return false; } } /* * Return 0 if the superblock checksum type matches the checksum value of that * algorithm. Pass the raw disk superblock data. */ int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, const struct btrfs_super_block *disk_sb) { char result[BTRFS_CSUM_SIZE]; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); shash->tfm = fs_info->csum_shash; /* * The super_block structure does not span the whole * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is * filled with zeros and is included in the checksum. */ crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); if (memcmp(disk_sb->csum, result, fs_info->csum_size)) return 1; return 0; } static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; int ret = 0; if (sb_rdonly(fs_info->sb)) return -EROFS; for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 start = max_t(u64, eb->start, folio_pos(folio)); u64 end = min_t(u64, eb->start + eb->len, folio_pos(folio) + eb->folio_size); u32 len = end - start; ret = btrfs_repair_io_failure(fs_info, 0, start, len, start, folio, offset_in_folio(folio, start), mirror_num); if (ret) break; } return ret; } /* * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. * * @check: expected tree parentness check, see the comments of the * structure for details. */ int btrfs_read_extent_buffer(struct extent_buffer *eb, const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; int failed = 0; int ret; int num_copies = 0; int mirror_num = 0; int failed_mirror = 0; ASSERT(check); while (1) { clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = read_extent_buffer_pages(eb, mirror_num, check); if (!ret) break; num_copies = btrfs_num_copies(fs_info, eb->start, eb->len); if (num_copies == 1) break; if (!failed_mirror) { failed = 1; failed_mirror = eb->read_mirror; } mirror_num++; if (mirror_num == failed_mirror) mirror_num++; if (mirror_num > num_copies) break; } if (failed && !ret && failed_mirror) btrfs_repair_eb_io_failure(eb, failed_mirror); return ret; } /* * Checksum a dirty tree block before IO. */ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start = btrfs_header_bytenr(eb); u64 last_trans; u8 result[BTRFS_CSUM_SIZE]; int ret; /* Btree blocks are always contiguous on disk. */ if (WARN_ON_ONCE(bbio->file_offset != eb->start)) return BLK_STS_IOERR; if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len)) return BLK_STS_IOERR; /* * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't * checksum it but zero-out its content. This is done to preserve * ordering of I/O without unnecessarily writing out data. */ if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) { memzero_extent_buffer(eb, 0, eb->len); return BLK_STS_OK; } if (WARN_ON_ONCE(found_start != eb->start)) return BLK_STS_IOERR; if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb))) return BLK_STS_IOERR; ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE) == 0); csum_tree_block(eb, result); if (btrfs_header_level(eb)) ret = btrfs_check_node(eb); else ret = btrfs_check_leaf(eb); if (ret < 0) goto error; /* * Also check the generation, the eb reached here must be newer than * last committed. Or something seriously wrong happened. */ last_trans = btrfs_get_last_trans_committed(fs_info); if (unlikely(btrfs_header_generation(eb) <= last_trans)) { ret = -EUCLEAN; btrfs_err(fs_info, "block=%llu bad generation, have %llu expect > %llu", eb->start, btrfs_header_generation(eb), last_trans); goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); return BLK_STS_OK; error: btrfs_print_tree(eb, 0); btrfs_err(fs_info, "block=%llu write time tree block corruption detected", eb->start); /* * Be noisy if this is an extent buffer from a log tree. We don't abort * a transaction in case there's a bad log tree extent buffer, we just * fallback to a transaction commit. Still we want to know when there is * a bad log tree extent buffer, as that may signal a bug somewhere. */ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) || btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID); return errno_to_blk_status(ret); } static bool check_tree_block_fsid(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; u8 fsid[BTRFS_FSID_SIZE]; read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE); /* * alloc_fsid_devices() copies the fsid into fs_devices::metadata_uuid. * This is then overwritten by metadata_uuid if it is present in the * device_list_add(). The same true for a seed device as well. So use of * fs_devices::metadata_uuid is appropriate here. */ if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0) return false; list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE)) return false; return true; } /* Do basic extent buffer checks at read time */ int btrfs_validate_extent_buffer(struct extent_buffer *eb, const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; const u32 csum_size = fs_info->csum_size; u8 found_level; u8 result[BTRFS_CSUM_SIZE]; const u8 *header_csum; int ret = 0; const bool ignore_csum = btrfs_test_opt(fs_info, IGNOREMETACSUMS); ASSERT(check); found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { btrfs_err_rl(fs_info, "bad tree block start, mirror %u want %llu have %llu", eb->read_mirror, eb->start, found_start); ret = -EIO; goto out; } if (check_tree_block_fsid(eb)) { btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u", eb->start, eb->read_mirror); ret = -EIO; goto out; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "bad tree block level, mirror %u level %d on logical %llu", eb->read_mirror, btrfs_header_level(eb), eb->start); ret = -EIO; goto out; } csum_tree_block(eb, result); header_csum = folio_address(eb->folios[0]) + get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum)); if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, "checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d%s", eb->start, eb->read_mirror, CSUM_FMT_VALUE(csum_size, header_csum), CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb), ignore_csum ? ", ignored" : ""); if (!ignore_csum) { ret = -EUCLEAN; goto out; } } if (found_level != check->level) { btrfs_err(fs_info, "level verify failed on logical %llu mirror %u wanted %u found %u", eb->start, eb->read_mirror, check->level, found_level); ret = -EIO; goto out; } if (unlikely(check->transid && btrfs_header_generation(eb) != check->transid)) { btrfs_err_rl(eb->fs_info, "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", eb->start, eb->read_mirror, check->transid, btrfs_header_generation(eb)); ret = -EIO; goto out; } if (check->has_first_key) { const struct btrfs_key *expect_key = &check->first_key; struct btrfs_key found_key; if (found_level) btrfs_node_key_to_cpu(eb, &found_key, 0); else btrfs_item_key_to_cpu(eb, &found_key, 0); if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) { btrfs_err(fs_info, "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", eb->start, check->transid, expect_key->objectid, expect_key->type, expect_key->offset, found_key.objectid, found_key.type, found_key.offset); ret = -EUCLEAN; goto out; } } if (check->owner_root) { ret = btrfs_check_eb_owner(eb, check->owner_root); if (ret < 0) goto out; } /* * If this is a leaf block and it is corrupt, set the corrupt bit so * that we don't try and read the other copies of this block, just * return -EIO. */ if (found_level == 0 && btrfs_check_leaf(eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } if (found_level > 0 && btrfs_check_node(eb)) ret = -EIO; if (ret) btrfs_err(fs_info, "read time tree block corruption detected on logical %llu mirror %u", eb->start, eb->read_mirror); out: return ret; } #ifdef CONFIG_MIGRATION static int btree_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { /* * we can't safely write a btree page from here, * we haven't done the locking hook */ if (folio_test_dirty(src)) return -EAGAIN; /* * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. */ if (folio_get_private(src) && !filemap_release_folio(src, GFP_KERNEL)) return -EAGAIN; return migrate_folio(mapping, dst, src, mode); } #else #define btree_migrate_folio NULL #endif static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; if (wbc->sync_mode == WB_SYNC_NONE) { struct btrfs_fs_info *fs_info; if (wbc->for_kupdate) return 0; fs_info = inode_to_fs_info(mapping->host); /* this is a bit racy, but that's ok */ ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes, BTRFS_DIRTY_METADATA_THRESH, fs_info->dirty_metadata_batch); if (ret < 0) return 0; } return btree_write_cache_pages(mapping, wbc); } static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags) { if (folio_test_writeback(folio) || folio_test_dirty(folio)) return false; return try_release_extent_buffer(folio); } static void btree_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct extent_io_tree *tree; tree = &folio_to_inode(folio)->io_tree; extent_invalidate_folio(tree, folio, offset); btree_release_folio(folio, GFP_NOFS); if (folio_get_private(folio)) { btrfs_warn(folio_to_fs_info(folio), "folio private not zero on folio %llu", (unsigned long long)folio_pos(folio)); folio_detach_private(folio); } } #ifdef DEBUG static bool btree_dirty_folio(struct address_space *mapping, struct folio *folio) { struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); struct btrfs_subpage_info *spi = fs_info->subpage_info; struct btrfs_subpage *subpage; struct extent_buffer *eb; int cur_bit = 0; u64 page_start = folio_pos(folio); if (fs_info->sectorsize == PAGE_SIZE) { eb = folio_get_private(folio); BUG_ON(!eb); BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(!atomic_read(&eb->refs)); btrfs_assert_tree_write_locked(eb); return filemap_dirty_folio(mapping, folio); } ASSERT(spi); subpage = folio_get_private(folio); for (cur_bit = spi->dirty_offset; cur_bit < spi->dirty_offset + spi->bitmap_nr_bits; cur_bit++) { unsigned long flags; u64 cur; spin_lock_irqsave(&subpage->lock, flags); if (!test_bit(cur_bit, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); continue; } spin_unlock_irqrestore(&subpage->lock, flags); cur = page_start + cur_bit * fs_info->sectorsize; eb = find_extent_buffer(fs_info, cur); ASSERT(eb); ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); ASSERT(atomic_read(&eb->refs)); btrfs_assert_tree_write_locked(eb); free_extent_buffer(eb); cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1; } return filemap_dirty_folio(mapping, folio); } #else #define btree_dirty_folio filemap_dirty_folio #endif static const struct address_space_operations btree_aops = { .writepages = btree_writepages, .release_folio = btree_release_folio, .invalidate_folio = btree_invalidate_folio, .migrate_folio = btree_migrate_folio, .dirty_folio = btree_dirty_folio, }; struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, int level) { if (btrfs_is_testing(fs_info)) return alloc_test_extent_buffer(fs_info, bytenr); return alloc_extent_buffer(fs_info, bytenr, owner_root, level); } /* * Read tree block at logical address @bytenr and do variant basic but critical * verification. * * @check: expected tree parentness check, see comments of the * structure for details. */ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, struct btrfs_tree_parent_check *check) { struct extent_buffer *buf = NULL; int ret; ASSERT(check); buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root, check->level); if (IS_ERR(buf)) return buf; ret = btrfs_read_extent_buffer(buf, check); if (ret) { free_extent_buffer_stale(buf); return ERR_PTR(ret); } return buf; } static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { bool dummy = btrfs_is_testing(fs_info); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); root->fs_info = fs_info; root->root_key.objectid = objectid; root->node = NULL; root->commit_root = NULL; root->state = 0; RB_CLEAR_NODE(&root->rb_node); btrfs_set_root_last_trans(root, 0); root->free_objectid = 0; root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; xa_init(&root->inodes); xa_init(&root->delayed_nodes); btrfs_init_root_block_rsv(root); INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->root_list); INIT_LIST_HEAD(&root->delalloc_inodes); INIT_LIST_HEAD(&root->delalloc_root); INIT_LIST_HEAD(&root->ordered_extents); INIT_LIST_HEAD(&root->ordered_root); INIT_LIST_HEAD(&root->reloc_dirty_list); spin_lock_init(&root->delalloc_lock); spin_lock_init(&root->ordered_extent_lock); spin_lock_init(&root->accounting_lock); spin_lock_init(&root->qgroup_meta_rsv_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); mutex_init(&root->ordered_extent_mutex); mutex_init(&root->delalloc_mutex); init_waitqueue_head(&root->qgroup_flush_wait); init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); INIT_LIST_HEAD(&root->log_ctxs[0]); INIT_LIST_HEAD(&root->log_ctxs[1]); atomic_set(&root->log_commit[0], 0); atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); refcount_set(&root->refs, 1); atomic_set(&root->snapshot_force_cow, 0); atomic_set(&root->nr_swapfiles, 0); btrfs_set_root_log_transid(root, 0); root->log_transid_committed = -1; btrfs_set_root_last_log_commit(root, 0); root->anon_dev = 0; if (!dummy) { extent_io_tree_init(fs_info, &root->dirty_log_pages, IO_TREE_ROOT_DIRTY_LOG_PAGES); extent_io_tree_init(fs_info, &root->log_csum_range, IO_TREE_LOG_CSUM_RANGE); } spin_lock_init(&root->root_item_lock); btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&root->leak_list); spin_lock(&fs_info->fs_roots_radix_lock); list_add_tail(&root->leak_list, &fs_info->allocated_roots); spin_unlock(&fs_info->fs_roots_radix_lock); #endif } static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, u64 objectid, gfp_t flags) { struct btrfs_root *root = kzalloc(sizeof(*root), flags); if (root) __setup_root(root, fs_info, objectid); return root; } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* Should only be used by the testing infrastructure */ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; if (!fs_info) return ERR_PTR(-EINVAL); root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); /* We don't use the stripesize in selftest, set it as sectorsize */ root->alloc_bytenr = 0; return root; } #endif static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node) { const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node); const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node); return btrfs_comp_cpu_keys(&a->root_key, &b->root_key); } static int global_root_key_cmp(const void *k, const struct rb_node *node) { const struct btrfs_key *key = k; const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node); return btrfs_comp_cpu_keys(key, &root->root_key); } int btrfs_global_root_insert(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *tmp; int ret = 0; write_lock(&fs_info->global_root_lock); tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp); write_unlock(&fs_info->global_root_lock); if (tmp) { ret = -EEXIST; btrfs_warn(fs_info, "global root %llu %llu already exists", btrfs_root_id(root), root->root_key.offset); } return ret; } void btrfs_global_root_delete(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; write_lock(&fs_info->global_root_lock); rb_erase(&root->rb_node, &fs_info->global_root_tree); write_unlock(&fs_info->global_root_lock); } struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, struct btrfs_key *key) { struct rb_node *node; struct btrfs_root *root = NULL; read_lock(&fs_info->global_root_lock); node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp); if (node) root = container_of(node, struct btrfs_root, rb_node); read_unlock(&fs_info->global_root_lock); return root; } static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_block_group *block_group; u64 ret; if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) return 0; if (bytenr) block_group = btrfs_lookup_block_group(fs_info, bytenr); else block_group = btrfs_lookup_first_block_group(fs_info, bytenr); ASSERT(block_group); if (!block_group) return 0; ret = block_group->global_root_id; btrfs_put_block_group(block_group); return ret; } struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_key key = { .objectid = BTRFS_CSUM_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, .offset = btrfs_global_root_id(fs_info, bytenr), }; return btrfs_global_root(fs_info, &key); } struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_key key = { .objectid = BTRFS_EXTENT_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, .offset = btrfs_global_root_id(fs_info, bytenr), }; return btrfs_global_root(fs_info, &key); } struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid) { struct btrfs_fs_info *fs_info = trans->fs_info; struct extent_buffer *leaf; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root; struct btrfs_key key; unsigned int nofs_flag; int ret = 0; /* * We're holding a transaction handle, so use a NOFS memory allocation * context to avoid deadlock if reclaim happens. */ nofs_flag = memalloc_nofs_save(); root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL); memalloc_nofs_restore(nofs_flag); if (!root) return ERR_PTR(-ENOMEM); root->root_key.objectid = objectid; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; goto fail; } root->node = leaf; btrfs_mark_buffer_dirty(trans, leaf); root->commit_root = btrfs_root_node(root); set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); btrfs_set_root_flags(&root->root_item, 0); btrfs_set_root_limit(&root->root_item, 0); btrfs_set_root_bytenr(&root->root_item, leaf->start); btrfs_set_root_generation(&root->root_item, trans->transid); btrfs_set_root_level(&root->root_item, 0); btrfs_set_root_refs(&root->root_item, 1); btrfs_set_root_used(&root->root_item, leaf->len); btrfs_set_root_last_snapshot(&root->root_item, 0); btrfs_set_root_dirid(&root->root_item, 0); if (is_fstree(objectid)) generate_random_guid(root->root_item.uuid); else export_guid(root->root_item.uuid, &guid_null); btrfs_set_root_drop_level(&root->root_item, 0); btrfs_tree_unlock(leaf); key.objectid = objectid; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = 0; ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item); if (ret) goto fail; return root; fail: btrfs_put_root(root); return ERR_PTR(ret); } static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS); if (!root) return ERR_PTR(-ENOMEM); root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; return root; } int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct extent_buffer *leaf; /* * DON'T set SHAREABLE bit for log trees. * * Log trees are not exposed to user space thus can't be snapshotted, * and they go away before a real commit is actually done. * * They do store pointers to file data extents, and those reference * counts still get updated (along with back refs to the log tree). */ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) return PTR_ERR(leaf); root->node = leaf; btrfs_mark_buffer_dirty(trans, root->node); btrfs_tree_unlock(root->node); return 0; } int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { struct btrfs_root *log_root; log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); if (!btrfs_is_zoned(fs_info)) { int ret = btrfs_alloc_log_tree_node(trans, log_root); if (ret) { btrfs_put_root(log_root); return ret; } } WARN_ON(fs_info->log_root_tree); fs_info->log_root_tree = log_root; return 0; } int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log_root; struct btrfs_inode_item *inode_item; int ret; log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); ret = btrfs_alloc_log_tree_node(trans, log_root); if (ret) { btrfs_put_root(log_root); return ret; } btrfs_set_root_last_trans(log_root, trans->transid); log_root->root_key.offset = btrfs_root_id(root); inode_item = &log_root->root_item.inode; btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); btrfs_set_stack_inode_nbytes(inode_item, fs_info->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); btrfs_set_root_node(&log_root->root_item, log_root->node); WARN_ON(root->log_root); root->log_root = log_root; btrfs_set_root_log_transid(root, 0); root->log_transid_committed = -1; btrfs_set_root_last_log_commit(root, 0); return 0; } static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, struct btrfs_path *path, const struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_tree_parent_check check = { 0 }; struct btrfs_fs_info *fs_info = tree_root->fs_info; u64 generation; int ret; int level; root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS); if (!root) return ERR_PTR(-ENOMEM); ret = btrfs_find_root(tree_root, key, path, &root->root_item, &root->root_key); if (ret) { if (ret > 0) ret = -ENOENT; goto fail; } generation = btrfs_root_generation(&root->root_item); level = btrfs_root_level(&root->root_item); check.level = level; check.transid = generation; check.owner_root = key->objectid; root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item), &check); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; goto fail; } if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; goto fail; } /* * For real fs, and not log/reloc trees, root owner must * match its root node owner */ if (!btrfs_is_testing(fs_info) && btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && btrfs_root_id(root) != btrfs_header_owner(root->node)) { btrfs_crit(fs_info, "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", btrfs_root_id(root), root->node->start, btrfs_header_owner(root->node), btrfs_root_id(root)); ret = -EUCLEAN; goto fail; } root->commit_root = btrfs_root_node(root); return root; fail: btrfs_put_root(root); return ERR_PTR(ret); } struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, const struct btrfs_key *key) { struct btrfs_root *root; BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); root = read_tree_root_path(tree_root, path, key); return root; } /* * Initialize subvolume root in-memory structure. * * @anon_dev: anonymous device to attach to the root, if zero, allocate new * * In case of failure the caller is responsible to call btrfs_free_fs_root() */ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) { int ret; btrfs_drew_lock_init(&root->snapshot_lock); if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && !btrfs_is_data_reloc_root(root) && is_fstree(btrfs_root_id(root))) { set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } /* * Don't assign anonymous block device to roots that are not exposed to * userspace, the id pool is limited to 1M */ if (is_fstree(btrfs_root_id(root)) && btrfs_root_refs(&root->root_item) > 0) { if (!anon_dev) { ret = get_anon_bdev(&root->anon_dev); if (ret) return ret; } else { root->anon_dev = anon_dev; } } mutex_lock(&root->objectid_mutex); ret = btrfs_init_root_free_objectid(root); if (ret) { mutex_unlock(&root->objectid_mutex); return ret; } ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); mutex_unlock(&root->objectid_mutex); return 0; } static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, u64 root_id) { struct btrfs_root *root; spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)root_id); root = btrfs_grab_root(root); spin_unlock(&fs_info->fs_roots_radix_lock); return root; } static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, u64 objectid) { struct btrfs_key key = { .objectid = objectid, .type = BTRFS_ROOT_ITEM_KEY, .offset = 0, }; switch (objectid) { case BTRFS_ROOT_TREE_OBJECTID: return btrfs_grab_root(fs_info->tree_root); case BTRFS_EXTENT_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); case BTRFS_CHUNK_TREE_OBJECTID: return btrfs_grab_root(fs_info->chunk_root); case BTRFS_DEV_TREE_OBJECTID: return btrfs_grab_root(fs_info->dev_root); case BTRFS_CSUM_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); case BTRFS_QUOTA_TREE_OBJECTID: return btrfs_grab_root(fs_info->quota_root); case BTRFS_UUID_TREE_OBJECTID: return btrfs_grab_root(fs_info->uuid_root); case BTRFS_BLOCK_GROUP_TREE_OBJECTID: return btrfs_grab_root(fs_info->block_group_root); case BTRFS_FREE_SPACE_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); case BTRFS_RAID_STRIPE_TREE_OBJECTID: return btrfs_grab_root(fs_info->stripe_root); default: return NULL; } } int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { int ret; ret = radix_tree_preload(GFP_NOFS); if (ret) return ret; spin_lock(&fs_info->fs_roots_radix_lock); ret = radix_tree_insert(&fs_info->fs_roots_radix, (unsigned long)btrfs_root_id(root), root); if (ret == 0) { btrfs_grab_root(root); set_bit(BTRFS_ROOT_IN_RADIX, &root->state); } spin_unlock(&fs_info->fs_roots_radix_lock); radix_tree_preload_end(); return ret; } void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info) { #ifdef CONFIG_BTRFS_DEBUG struct btrfs_root *root; while (!list_empty(&fs_info->allocated_roots)) { char buf[BTRFS_ROOT_NAME_BUF_LEN]; root = list_first_entry(&fs_info->allocated_roots, struct btrfs_root, leak_list); btrfs_err(fs_info, "leaked root %s refcount %d", btrfs_root_name(&root->root_key, buf), refcount_read(&root->refs)); WARN_ON_ONCE(1); while (refcount_read(&root->refs) > 1) btrfs_put_root(root); btrfs_put_root(root); } #endif } static void free_global_roots(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; struct rb_node *node; while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) { root = rb_entry(node, struct btrfs_root, rb_node); rb_erase(&root->rb_node, &fs_info->global_root_tree); btrfs_put_root(root); } } void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { struct percpu_counter *em_counter = &fs_info->evictable_extent_maps; percpu_counter_destroy(&fs_info->stats_read_blocks); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->ordered_bytes); if (percpu_counter_initialized(em_counter)) ASSERT(percpu_counter_sum_positive(em_counter) == 0); percpu_counter_destroy(em_counter); percpu_counter_destroy(&fs_info->dev_replace.bio_counter); btrfs_free_csum_hash(fs_info); btrfs_free_stripe_hash_table(fs_info); btrfs_free_ref_cache(fs_info); kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); free_global_roots(fs_info); btrfs_put_root(fs_info->tree_root); btrfs_put_root(fs_info->chunk_root); btrfs_put_root(fs_info->dev_root); btrfs_put_root(fs_info->quota_root); btrfs_put_root(fs_info->uuid_root); btrfs_put_root(fs_info->fs_root); btrfs_put_root(fs_info->data_reloc_root); btrfs_put_root(fs_info->block_group_root); btrfs_put_root(fs_info->stripe_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); kvfree(fs_info); } /* * Get an in-memory reference of a root structure. * * For essential trees like root/extent tree, we grab it from fs_info directly. * For subvolume trees, we check the cached filesystem roots first. If not * found, then read it from disk and add it to cached fs roots. * * Caller should release the root by calling btrfs_put_root() after the usage. * * NOTE: Reloc and log trees can't be read by this function as they share the * same root objectid. * * @objectid: root id * @anon_dev: preallocated anonymous block device number for new roots, * pass NULL for a new allocation. * @check_ref: whether to check root item references, If true, return -ENOENT * for orphan roots */ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, u64 objectid, dev_t *anon_dev, bool check_ref) { struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; int ret; root = btrfs_get_global_root(fs_info, objectid); if (root) return root; /* * If we're called for non-subvolume trees, and above function didn't * find one, do not try to read it from disk. * * This is namely for free-space-tree and quota tree, which can change * at runtime and should only be grabbed from fs_info. */ if (!is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) return ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, objectid); if (root) { /* * Some other caller may have read out the newly inserted * subvolume already (for things like backref walk etc). Not * that common but still possible. In that case, we just need * to free the anon_dev. */ if (unlikely(anon_dev && *anon_dev)) { free_anon_bdev(*anon_dev); *anon_dev = 0; } if (check_ref && btrfs_root_refs(&root->root_item) == 0) { btrfs_put_root(root); return ERR_PTR(-ENOENT); } return root; } key.objectid = objectid; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; root = btrfs_read_tree_root(fs_info->tree_root, &key); if (IS_ERR(root)) return root; if (check_ref && btrfs_root_refs(&root->root_item) == 0) { ret = -ENOENT; goto fail; } ret = btrfs_init_fs_root(root, anon_dev ? *anon_dev : 0); if (ret) goto fail; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto fail; } key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); btrfs_free_path(path); if (ret < 0) goto fail; if (ret == 0) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); ret = btrfs_insert_fs_root(fs_info, root); if (ret) { if (ret == -EEXIST) { btrfs_put_root(root); goto again; } goto fail; } return root; fail: /* * If our caller provided us an anonymous device, then it's his * responsibility to free it in case we fail. So we have to set our * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root() * and once again by our caller. */ if (anon_dev && *anon_dev) root->anon_dev = 0; btrfs_put_root(root); return ERR_PTR(ret); } /* * Get in-memory reference of a root structure * * @objectid: tree objectid * @check_ref: if set, verify that the tree exists and the item has at least * one reference */ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, bool check_ref) { return btrfs_get_root_ref(fs_info, objectid, NULL, check_ref); } /* * Get in-memory reference of a root structure, created as new, optionally pass * the anonymous block device id * * @objectid: tree objectid * @anon_dev: if NULL, allocate a new anonymous block device or use the * parameter value if not NULL */ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, dev_t *anon_dev) { return btrfs_get_root_ref(fs_info, objectid, anon_dev, true); } /* * Return a root for the given objectid. * * @fs_info: the fs_info * @objectid: the objectid we need to lookup * * This is exclusively used for backref walking, and exists specifically because * of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref * creation time, which means we may have to read the tree_root in order to look * up a fs root that is not in memory. If the root is not in memory we will * read the tree root commit root and look up the fs root from there. This is a * temporary root, it will not be inserted into the radix tree as it doesn't * have the most uptodate information, it'll simply be discarded once the * backref code is finished using the root. */ struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 objectid) { struct btrfs_root *root; struct btrfs_key key; ASSERT(path->search_commit_root && path->skip_locking); /* * This can return -ENOENT if we ask for a root that doesn't exist, but * since this is called via the backref walking code we won't be looking * up a root that doesn't exist, unless there's corruption. So if root * != NULL just return it. */ root = btrfs_get_global_root(fs_info, objectid); if (root) return root; root = btrfs_lookup_fs_root(fs_info, objectid); if (root) return root; key.objectid = objectid; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; root = read_tree_root_path(fs_info->tree_root, path, &key); btrfs_release_path(path); return root; } static int cleaner_kthread(void *arg) { struct btrfs_fs_info *fs_info = arg; int again; while (1) { again = 0; set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); /* Make the cleaner go to sleep early. */ if (btrfs_need_cleaner_sleep(fs_info)) goto sleep; /* * Do not do anything if we might cause open_ctree() to block * before we have finished mounting the filesystem. */ if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) goto sleep; if (!mutex_trylock(&fs_info->cleaner_mutex)) goto sleep; /* * Avoid the problem that we change the status of the fs * during the above check and trylock. */ if (btrfs_need_cleaner_sleep(fs_info)) { mutex_unlock(&fs_info->cleaner_mutex); goto sleep; } if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags)) btrfs_sysfs_feature_update(fs_info); btrfs_run_delayed_iputs(fs_info); again = btrfs_clean_one_deleted_snapshot(fs_info); mutex_unlock(&fs_info->cleaner_mutex); /* * The defragger has dealt with the R/O remount and umount, * needn't do anything special here. */ btrfs_run_defrag_inodes(fs_info); /* * Acquires fs_info->reclaim_bgs_lock to avoid racing * with relocation (btrfs_relocate_chunk) and relocation * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group) * after acquiring fs_info->reclaim_bgs_lock. So we * can't hold, nor need to, fs_info->cleaner_mutex when deleting * unused block groups. */ btrfs_delete_unused_bgs(fs_info); /* * Reclaim block groups in the reclaim_bgs list after we deleted * all unused block_groups. This possibly gives us some more free * space. */ btrfs_reclaim_bgs(fs_info); sleep: clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); if (kthread_should_park()) kthread_parkme(); if (kthread_should_stop()) return 0; if (!again) { set_current_state(TASK_INTERRUPTIBLE); schedule(); __set_current_state(TASK_RUNNING); } } } static int transaction_kthread(void *arg) { struct btrfs_root *root = arg; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_transaction *cur; u64 transid; time64_t delta; unsigned long delay; bool cannot_commit; do { cannot_commit = false; delay = secs_to_jiffies(fs_info->commit_interval); mutex_lock(&fs_info->transaction_kthread_mutex); spin_lock(&fs_info->trans_lock); cur = fs_info->running_transaction; if (!cur) { spin_unlock(&fs_info->trans_lock); goto sleep; } delta = ktime_get_seconds() - cur->start_time; if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) && cur->state < TRANS_STATE_COMMIT_PREP && delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); delay -= secs_to_jiffies(delta - 1); delay = min(delay, secs_to_jiffies(fs_info->commit_interval)); goto sleep; } transid = cur->transid; spin_unlock(&fs_info->trans_lock); /* If the file system is aborted, this will always fail. */ trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) cannot_commit = true; goto sleep; } if (transid == trans->transid) { btrfs_commit_transaction(trans); } else { btrfs_end_transaction(trans); } sleep: wake_up_process(fs_info->cleaner_kthread); mutex_unlock(&fs_info->transaction_kthread_mutex); if (BTRFS_FS_ERROR(fs_info)) btrfs_cleanup_transaction(fs_info); if (!kthread_should_stop() && (!btrfs_transaction_blocked(fs_info) || cannot_commit)) schedule_timeout_interruptible(delay); } while (!kthread_should_stop()); return 0; } /* * This will find the highest generation in the array of root backups. The * index of the highest array is returned, or -EINVAL if we can't find * anything. * * We check to make sure the array is valid by comparing the * generation of the latest root in the array with the generation * in the super block. If they don't match we pitch it. */ static int find_newest_super_backup(struct btrfs_fs_info *info) { const u64 newest_gen = btrfs_super_generation(info->super_copy); u64 cur; struct btrfs_root_backup *root_backup; int i; for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { root_backup = info->super_copy->super_roots + i; cur = btrfs_backup_tree_root_gen(root_backup); if (cur == newest_gen) return i; } return -EINVAL; } /* * copy all the root pointers into the super backup array. * this will bump the backup pointer by one when it is * done */ static void backup_super_roots(struct btrfs_fs_info *info) { const int next_backup = info->backup_root_index; struct btrfs_root_backup *root_backup; root_backup = info->super_for_commit->super_roots + next_backup; /* * make sure all of our padding and empty slots get zero filled * regardless of which ones we use today */ memset(root_backup, 0, sizeof(*root_backup)); info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS; btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start); btrfs_set_backup_tree_root_gen(root_backup, btrfs_header_generation(info->tree_root->node)); btrfs_set_backup_tree_root_level(root_backup, btrfs_header_level(info->tree_root->node)); btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start); btrfs_set_backup_chunk_root_gen(root_backup, btrfs_header_generation(info->chunk_root->node)); btrfs_set_backup_chunk_root_level(root_backup, btrfs_header_level(info->chunk_root->node)); if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) { struct btrfs_root *extent_root = btrfs_extent_root(info, 0); struct btrfs_root *csum_root = btrfs_csum_root(info, 0); btrfs_set_backup_extent_root(root_backup, extent_root->node->start); btrfs_set_backup_extent_root_gen(root_backup, btrfs_header_generation(extent_root->node)); btrfs_set_backup_extent_root_level(root_backup, btrfs_header_level(extent_root->node)); btrfs_set_backup_csum_root(root_backup, csum_root->node->start); btrfs_set_backup_csum_root_gen(root_backup, btrfs_header_generation(csum_root->node)); btrfs_set_backup_csum_root_level(root_backup, btrfs_header_level(csum_root->node)); } /* * we might commit during log recovery, which happens before we set * the fs_root. Make sure it is valid before we fill it in. */ if (info->fs_root && info->fs_root->node) { btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start); btrfs_set_backup_fs_root_gen(root_backup, btrfs_header_generation(info->fs_root->node)); btrfs_set_backup_fs_root_level(root_backup, btrfs_header_level(info->fs_root->node)); } btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); btrfs_set_backup_dev_root_gen(root_backup, btrfs_header_generation(info->dev_root->node)); btrfs_set_backup_dev_root_level(root_backup, btrfs_header_level(info->dev_root->node)); btrfs_set_backup_total_bytes(root_backup, btrfs_super_total_bytes(info->super_copy)); btrfs_set_backup_bytes_used(root_backup, btrfs_super_bytes_used(info->super_copy)); btrfs_set_backup_num_devices(root_backup, btrfs_super_num_devices(info->super_copy)); /* * if we don't copy this out to the super_copy, it won't get remembered * for the next commit */ memcpy(&info->super_copy->super_roots, &info->super_for_commit->super_roots, sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS); } /* * Reads a backup root based on the passed priority. Prio 0 is the newest, prio * 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots * * @fs_info: filesystem whose backup roots need to be read * @priority: priority of backup root required * * Returns backup root index on success and -EINVAL otherwise. */ static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority) { int backup_index = find_newest_super_backup(fs_info); struct btrfs_super_block *super = fs_info->super_copy; struct btrfs_root_backup *root_backup; if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) { if (priority == 0) return backup_index; backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority; backup_index %= BTRFS_NUM_BACKUP_ROOTS; } else { return -EINVAL; } root_backup = super->super_roots + backup_index; btrfs_set_super_generation(super, btrfs_backup_tree_root_gen(root_backup)); btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup)); btrfs_set_super_root_level(super, btrfs_backup_tree_root_level(root_backup)); btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); /* * Fixme: the total bytes and num_devices need to match or we should * need a fsck */ btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); return backup_index; } /* helper to cleanup workers */ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { btrfs_destroy_workqueue(fs_info->fixup_workers); btrfs_destroy_workqueue(fs_info->delalloc_workers); btrfs_destroy_workqueue(fs_info->workers); if (fs_info->endio_workers) destroy_workqueue(fs_info->endio_workers); if (fs_info->rmw_workers) destroy_workqueue(fs_info->rmw_workers); if (fs_info->compressed_write_workers) destroy_workqueue(fs_info->compressed_write_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->delayed_workers); btrfs_destroy_workqueue(fs_info->caching_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); if (fs_info->discard_ctl.discard_workers) destroy_workqueue(fs_info->discard_ctl.discard_workers); /* * Now that all other work queues are destroyed, we can safely destroy * the queues used for metadata I/O, since tasks from those other work * queues can do metadata I/O operations. */ if (fs_info->endio_meta_workers) destroy_workqueue(fs_info->endio_meta_workers); } static void free_root_extent_buffers(struct btrfs_root *root) { if (root) { free_extent_buffer(root->node); free_extent_buffer(root->commit_root); root->node = NULL; root->commit_root = NULL; } } static void free_global_root_pointers(struct btrfs_fs_info *fs_info) { struct btrfs_root *root, *tmp; rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree, rb_node) free_root_extent_buffers(root); } /* helper to cleanup tree roots */ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) { free_root_extent_buffers(info->tree_root); free_global_root_pointers(info); free_root_extent_buffers(info->dev_root); free_root_extent_buffers(info->quota_root); free_root_extent_buffers(info->uuid_root); free_root_extent_buffers(info->fs_root); free_root_extent_buffers(info->data_reloc_root); free_root_extent_buffers(info->block_group_root); free_root_extent_buffers(info->stripe_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); } void btrfs_put_root(struct btrfs_root *root) { if (!root) return; if (refcount_dec_and_test(&root->refs)) { if (WARN_ON(!xa_empty(&root->inodes))) xa_destroy(&root->inodes); WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); if (root->anon_dev) free_anon_bdev(root->anon_dev); free_root_extent_buffers(root); #ifdef CONFIG_BTRFS_DEBUG spin_lock(&root->fs_info->fs_roots_radix_lock); list_del_init(&root->leak_list); spin_unlock(&root->fs_info->fs_roots_radix_lock); #endif kfree(root); } } void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { int ret; struct btrfs_root *gang[8]; int i; while (!list_empty(&fs_info->dead_roots)) { gang[0] = list_entry(fs_info->dead_roots.next, struct btrfs_root, root_list); list_del(&gang[0]->root_list); if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) btrfs_drop_and_free_fs_root(fs_info, gang[0]); btrfs_put_root(gang[0]); } while (1) { ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, 0, ARRAY_SIZE(gang)); if (!ret) break; for (i = 0; i < ret; i++) btrfs_drop_and_free_fs_root(fs_info, gang[i]); } } static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) { mutex_init(&fs_info->scrub_lock); atomic_set(&fs_info->scrubs_running, 0); atomic_set(&fs_info->scrub_pause_req, 0); atomic_set(&fs_info->scrubs_paused, 0); atomic_set(&fs_info->scrub_cancel_req, 0); init_waitqueue_head(&fs_info->scrub_pause_wait); refcount_set(&fs_info->scrub_workers_refcnt, 0); } static void btrfs_init_balance(struct btrfs_fs_info *fs_info) { spin_lock_init(&fs_info->balance_lock); mutex_init(&fs_info->balance_mutex); atomic_set(&fs_info->balance_pause_req, 0); atomic_set(&fs_info->balance_cancel_req, 0); fs_info->balance_ctl = NULL; init_waitqueue_head(&fs_info->balance_wait_q); atomic_set(&fs_info->reloc_cancel_req, 0); } static int btrfs_init_btree_inode(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID, fs_info->tree_root); struct inode *inode; inode = new_inode(sb); if (!inode) return -ENOMEM; btrfs_set_inode_number(BTRFS_I(inode), BTRFS_BTREE_INODE_OBJECTID); set_nlink(inode, 1); /* * we set the i_size on the btree inode to the max possible int. * the real end of the address space is determined by all of * the devices in the system */ inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &btree_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, IO_TREE_BTREE_INODE_IO); extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); __insert_inode_hash(inode, hash); fs_info->btree_inode = inode; return 0; } static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) { mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); init_rwsem(&fs_info->dev_replace.rwsem); init_waitqueue_head(&fs_info->dev_replace.replace_wait); } static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) { spin_lock_init(&fs_info->qgroup_lock); mutex_init(&fs_info->qgroup_ioctl_lock); fs_info->qgroup_tree = RB_ROOT; INIT_LIST_HEAD(&fs_info->dirty_qgroups); fs_info->qgroup_seq = 1; fs_info->qgroup_ulist = NULL; fs_info->qgroup_rescan_running = false; fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; mutex_init(&fs_info->qgroup_rescan_lock); } static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) { u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE; fs_info->workers = btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); fs_info->delalloc_workers = btrfs_alloc_workqueue(fs_info, "delalloc", flags, max_active, 2); fs_info->flush_workers = btrfs_alloc_workqueue(fs_info, "flush_delalloc", flags, max_active, 0); fs_info->caching_workers = btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0); fs_info->fixup_workers = btrfs_alloc_ordered_workqueue(fs_info, "fixup", ordered_flags); fs_info->endio_workers = alloc_workqueue("btrfs-endio", flags, max_active); fs_info->endio_meta_workers = alloc_workqueue("btrfs-endio-meta", flags, max_active); fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, max_active, 2); fs_info->compressed_write_workers = alloc_workqueue("btrfs-compressed-write", flags, max_active); fs_info->endio_freespace_worker = btrfs_alloc_workqueue(fs_info, "freespace-write", flags, max_active, 0); fs_info->delayed_workers = btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, max_active, 0); fs_info->qgroup_rescan_workers = btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan", ordered_flags); fs_info->discard_ctl.discard_workers = alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE); if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->compressed_write_workers && fs_info->endio_write_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->fixup_workers && fs_info->delayed_workers && fs_info->qgroup_rescan_workers && fs_info->discard_ctl.discard_workers)) { return -ENOMEM; } return 0; } static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) { struct crypto_shash *csum_shash; const char *csum_driver = btrfs_super_csum_driver(csum_type); csum_shash = crypto_alloc_shash(csum_driver, 0, 0); if (IS_ERR(csum_shash)) { btrfs_err(fs_info, "error allocating %s hash for checksum", csum_driver); return PTR_ERR(csum_shash); } fs_info->csum_shash = csum_shash; /* * Check if the checksum implementation is a fast accelerated one. * As-is this is a bit of a hack and should be replaced once the csum * implementations provide that information themselves. */ switch (csum_type) { case BTRFS_CSUM_TYPE_CRC32: if (!strstr(crypto_shash_driver_name(csum_shash), "generic")) set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); break; case BTRFS_CSUM_TYPE_XXHASH: set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); break; default: break; } btrfs_info(fs_info, "using %s (%s) checksum algorithm", btrfs_super_csum_name(csum_type), crypto_shash_driver_name(csum_shash)); return 0; } static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { int ret; struct btrfs_tree_parent_check check = { 0 }; struct btrfs_root *log_tree_root; struct btrfs_super_block *disk_super = fs_info->super_copy; u64 bytenr = btrfs_super_log_root(disk_super); int level = btrfs_super_log_root_level(disk_super); if (fs_devices->rw_devices == 0) { btrfs_warn(fs_info, "log replay required on RO media"); return -EIO; } log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_KERNEL); if (!log_tree_root) return -ENOMEM; check.level = level; check.transid = fs_info->generation + 1; check.owner_root = BTRFS_TREE_LOG_OBJECTID; log_tree_root->node = read_tree_block(fs_info, bytenr, &check); if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); log_tree_root->node = NULL; btrfs_put_root(log_tree_root); return ret; } if (!extent_buffer_uptodate(log_tree_root->node)) { btrfs_err(fs_info, "failed to read log tree"); btrfs_put_root(log_tree_root); return -EIO; } /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { btrfs_handle_fs_error(fs_info, ret, "Failed to recover log tree"); btrfs_put_root(log_tree_root); return ret; } if (sb_rdonly(fs_info->sb)) { ret = btrfs_commit_super(fs_info); if (ret) return ret; } return 0; } static int load_global_roots_objectid(struct btrfs_root *tree_root, struct btrfs_path *path, u64 objectid, const char *name) { struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_root *root; u64 max_global_id = 0; int ret; struct btrfs_key key = { .objectid = objectid, .type = BTRFS_ROOT_ITEM_KEY, .offset = 0, }; bool found = false; /* If we have IGNOREDATACSUMS skip loading these roots. */ if (objectid == BTRFS_CSUM_TREE_OBJECTID && btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state); return 0; } while (1) { ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); if (ret < 0) break; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(tree_root, path); if (ret) { if (ret > 0) ret = 0; break; } } ret = 0; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid != objectid) break; btrfs_release_path(path); /* * Just worry about this for extent tree, it'll be the same for * everybody. */ if (objectid == BTRFS_EXTENT_TREE_OBJECTID) max_global_id = max(max_global_id, key.offset); found = true; root = read_tree_root_path(tree_root, path, &key); if (IS_ERR(root)) { if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) ret = PTR_ERR(root); break; } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); ret = btrfs_global_root_insert(root); if (ret) { btrfs_put_root(root); break; } key.offset++; } btrfs_release_path(path); if (objectid == BTRFS_EXTENT_TREE_OBJECTID) fs_info->nr_global_roots = max_global_id + 1; if (!found || ret) { if (objectid == BTRFS_CSUM_TREE_OBJECTID) set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state); if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) ret = ret ? ret : -ENOENT; else ret = 0; btrfs_err(fs_info, "failed to load root %s", name); } return ret; } static int load_global_roots(struct btrfs_root *tree_root) { BTRFS_PATH_AUTO_FREE(path); int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = load_global_roots_objectid(tree_root, path, BTRFS_EXTENT_TREE_OBJECTID, "extent"); if (ret) return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_CSUM_TREE_OBJECTID, "csum"); if (ret) return ret; if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_FREE_SPACE_TREE_OBJECTID, "free space"); return ret; } static int btrfs_read_roots(struct btrfs_fs_info *fs_info) { struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root; struct btrfs_key location; int ret; ASSERT(fs_info->tree_root); ret = load_global_roots(tree_root); if (ret) return ret; location.type = BTRFS_ROOT_ITEM_KEY; location.offset = 0; if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { ret = PTR_ERR(root); goto out; } } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->block_group_root = root; } } location.objectid = BTRFS_DEV_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { ret = PTR_ERR(root); goto out; } } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->dev_root = root; } /* Initialize fs_info for all devices in any case */ ret = btrfs_init_devices_late(fs_info); if (ret) goto out; /* * This tree can share blocks with some other fs tree during relocation * and we need a proper setup by btrfs_get_fs_root */ root = btrfs_get_fs_root(tree_root->fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID, true); if (IS_ERR(root)) { if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { ret = PTR_ERR(root); goto out; } } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->data_reloc_root = root; } location.objectid = BTRFS_QUOTA_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (!IS_ERR(root)) { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->quota_root = root; } location.objectid = BTRFS_UUID_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { ret = PTR_ERR(root); if (ret != -ENOENT) goto out; } } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->uuid_root = root; } if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) { location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { ret = PTR_ERR(root); goto out; } } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->stripe_root = root; } } return 0; out: btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d", location.objectid, ret); return ret; } static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, const struct btrfs_super_block *sb) { unsigned int cur = 0; /* Offset inside the sys chunk array */ /* * At sb read time, fs_info is not fully initialized. Thus we have * to use super block sectorsize, which should have been validated. */ const u32 sectorsize = btrfs_super_sectorsize(sb); u32 sys_array_size = btrfs_super_sys_array_size(sb); if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { btrfs_err(fs_info, "system chunk array too big %u > %u", sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); return -EUCLEAN; } while (cur < sys_array_size) { struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; struct btrfs_key key; u64 type; u16 num_stripes; u32 len; int ret; disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur); len = sizeof(*disk_key); if (cur + len > sys_array_size) goto short_read; cur += len; btrfs_disk_key_to_cpu(&key, disk_key); if (key.type != BTRFS_CHUNK_ITEM_KEY) { btrfs_err(fs_info, "unexpected item type %u in sys_array at offset %u", key.type, cur); return -EUCLEAN; } chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur); num_stripes = btrfs_stack_chunk_num_stripes(chunk); if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size) goto short_read; type = btrfs_stack_chunk_type(chunk); if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { btrfs_err(fs_info, "invalid chunk type %llu in sys_array at offset %u", type, cur); return -EUCLEAN; } ret = btrfs_check_chunk_valid(fs_info, NULL, chunk, key.offset, sectorsize); if (ret < 0) return ret; cur += btrfs_chunk_item_size(num_stripes); } return 0; short_read: btrfs_err(fs_info, "super block sys chunk array short read, cur=%u sys_array_size=%u", cur, sys_array_size); return -EUCLEAN; } /* * Real super block validation * NOTE: super csum type and incompat features will not be checked here. * * @sb: super block to check * @mirror_num: the super block number to check its bytenr: * 0 the primary (1st) sb * 1, 2 2nd and 3rd backup copy * -1 skip bytenr check */ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, const struct btrfs_super_block *sb, int mirror_num) { u64 nodesize = btrfs_super_nodesize(sb); u64 sectorsize = btrfs_super_sectorsize(sb); int ret = 0; const bool ignore_flags = btrfs_test_opt(fs_info, IGNORESUPERFLAGS); if (btrfs_super_magic(sb) != BTRFS_MAGIC) { btrfs_err(fs_info, "no valid FS found"); ret = -EINVAL; } if ((btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)) { if (!ignore_flags) { btrfs_err(fs_info, "unrecognized or unsupported super flag 0x%llx", btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); ret = -EINVAL; } else { btrfs_info(fs_info, "unrecognized or unsupported super flags: 0x%llx, ignored", btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); } } if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "tree_root level too big: %d >= %d", btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "chunk_root level too big: %d >= %d", btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "log_root level too big: %d >= %d", btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } /* * Check sectorsize and nodesize first, other check will need it. * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. */ if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE || sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); ret = -EINVAL; } /* * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE. * * For 4K page sized systems with non-debug builds, all 3 matches (4K). * For 4K page sized systems with debug builds, there are two block sizes * supported. (4K and 2K) * * We can support 16K sectorsize with 64K page size without problem, * but such sectorsize/pagesize combination doesn't make much sense. * 4K will be our future standard, PAGE_SIZE is supported from the very * beginning. */ if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE && sectorsize != BTRFS_MIN_BLOCKSIZE)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); ret = -EINVAL; } if (!is_power_of_2(nodesize) || nodesize < sectorsize || nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid nodesize %llu", nodesize); ret = -EINVAL; } if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { btrfs_err(fs_info, "invalid leafsize %u, should be %llu", le32_to_cpu(sb->__unused_leafsize), nodesize); ret = -EINVAL; } /* Root alignment check */ if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { btrfs_warn(fs_info, "tree_root block unaligned: %llu", btrfs_super_root(sb)); ret = -EINVAL; } if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { btrfs_warn(fs_info, "chunk_root block unaligned: %llu", btrfs_super_chunk_root(sb)); ret = -EINVAL; } if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { btrfs_warn(fs_info, "log_root block unaligned: %llu", btrfs_super_log_root(sb)); ret = -EINVAL; } if (!fs_info->fs_devices->temp_fsid && memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "superblock fsid doesn't match fsid of fs_devices: %pU != %pU", sb->fsid, fs_info->fs_devices->fsid); ret = -EINVAL; } if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb), BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU", btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid); ret = -EINVAL; } if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "dev_item UUID does not match metadata fsid: %pU != %pU", fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); ret = -EINVAL; } /* * Artificial requirement for block-group-tree to force newer features * (free-space-tree, no-holes) so the test matrix is smaller. */ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) && (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || !btrfs_fs_incompat(fs_info, NO_HOLES))) { btrfs_err(fs_info, "block-group-tree feature requires free-space-tree and no-holes"); ret = -EINVAL; } /* * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later */ if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { btrfs_err(fs_info, "bytes_used is too small %llu", btrfs_super_bytes_used(sb)); ret = -EINVAL; } if (!is_power_of_2(btrfs_super_stripesize(sb))) { btrfs_err(fs_info, "invalid stripesize %u", btrfs_super_stripesize(sb)); ret = -EINVAL; } if (btrfs_super_num_devices(sb) > (1UL << 31)) btrfs_warn(fs_info, "suspicious number of devices: %llu", btrfs_super_num_devices(sb)); if (btrfs_super_num_devices(sb) == 0) { btrfs_err(fs_info, "number of devices is 0"); ret = -EINVAL; } if (mirror_num >= 0 && btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) { btrfs_err(fs_info, "super offset mismatch %llu != %u", btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); ret = -EINVAL; } if (ret) return ret; ret = validate_sys_chunk_array(fs_info, sb); /* * Obvious sys_chunk_array corruptions, it must hold at least one key * and one chunk */ if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { btrfs_err(fs_info, "system chunk array too big %u > %u", btrfs_super_sys_array_size(sb), BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); ret = -EINVAL; } if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) + sizeof(struct btrfs_chunk)) { btrfs_err(fs_info, "system chunk array too small %u < %zu", btrfs_super_sys_array_size(sb), sizeof(struct btrfs_disk_key) + sizeof(struct btrfs_chunk)); ret = -EINVAL; } /* * The generation is a global counter, we'll trust it more than the others * but it's still possible that it's the one that's wrong. */ if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) btrfs_warn(fs_info, "suspicious: generation < chunk_root_generation: %llu < %llu", btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb)); if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) && btrfs_super_cache_generation(sb) != (u64)-1) btrfs_warn(fs_info, "suspicious: generation < cache_generation: %llu < %llu", btrfs_super_generation(sb), btrfs_super_cache_generation(sb)); return ret; } /* * Validation of super block at mount time. * Some checks already done early at mount time, like csum type and incompat * flags will be skipped. */ static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info) { return btrfs_validate_super(fs_info, fs_info->super_copy, 0); } /* * Validation of super block at write time. * Some checks like bytenr check will be skipped as their values will be * overwritten soon. * Extra checks like csum type and incompat flags will be done here. */ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, struct btrfs_super_block *sb) { int ret; ret = btrfs_validate_super(fs_info, sb, -1); if (ret < 0) goto out; if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid csum type, has %u want %u", btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); goto out; } if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid incompat flags, has 0x%llx valid mask 0x%llx", btrfs_super_incompat_flags(sb), (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP); goto out; } out: if (ret < 0) btrfs_err(fs_info, "super block corruption detected before writing it to disk"); return ret; } static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level) { struct btrfs_tree_parent_check check = { .level = level, .transid = gen, .owner_root = btrfs_root_id(root) }; int ret = 0; root->node = read_tree_block(root->fs_info, bytenr, &check); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; return ret; } if (!extent_buffer_uptodate(root->node)) { free_extent_buffer(root->node); root->node = NULL; return -EIO; } btrfs_set_root_node(&root->root_item, root->node); root->commit_root = btrfs_root_node(root); btrfs_set_root_refs(&root->root_item, 1); return ret; } static int load_important_roots(struct btrfs_fs_info *fs_info) { struct btrfs_super_block *sb = fs_info->super_copy; u64 gen, bytenr; int level, ret; bytenr = btrfs_super_root(sb); gen = btrfs_super_generation(sb); level = btrfs_super_root_level(sb); ret = load_super_root(fs_info->tree_root, bytenr, gen, level); if (ret) { btrfs_warn(fs_info, "couldn't read tree root"); return ret; } return 0; } static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) { int backup_index = find_newest_super_backup(fs_info); struct btrfs_super_block *sb = fs_info->super_copy; struct btrfs_root *tree_root = fs_info->tree_root; bool handle_error = false; int ret = 0; int i; for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { if (handle_error) { if (!IS_ERR(tree_root->node)) free_extent_buffer(tree_root->node); tree_root->node = NULL; if (!btrfs_test_opt(fs_info, USEBACKUPROOT)) break; free_root_pointers(fs_info, 0); /* * Don't use the log in recovery mode, it won't be * valid */ btrfs_set_super_log_root(sb, 0); btrfs_warn(fs_info, "try to load backup roots slot %d", i); ret = read_backup_root(fs_info, i); backup_index = ret; if (ret < 0) return ret; } ret = load_important_roots(fs_info); if (ret) { handle_error = true; continue; } /* * No need to hold btrfs_root::objectid_mutex since the fs * hasn't been fully initialised and we are the only user */ ret = btrfs_init_root_free_objectid(tree_root); if (ret < 0) { handle_error = true; continue; } ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); ret = btrfs_read_roots(fs_info); if (ret < 0) { handle_error = true; continue; } /* All successful */ fs_info->generation = btrfs_header_generation(tree_root->node); btrfs_set_last_trans_committed(fs_info, fs_info->generation); fs_info->last_reloc_trans = 0; /* Always begin writing backup roots after the one being used */ if (backup_index < 0) { fs_info->backup_root_index = 0; } else { fs_info->backup_root_index = backup_index + 1; fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS; } break; } return ret; } void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); INIT_LIST_HEAD(&fs_info->delalloc_roots); INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); spin_lock_init(&fs_info->treelog_bg_lock); spin_lock_init(&fs_info->zone_active_bgs_lock); spin_lock_init(&fs_info->relocation_bg_lock); rwlock_init(&fs_info->tree_mod_log_lock); rwlock_init(&fs_info->global_root_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); mutex_init(&fs_info->reclaim_bgs_lock); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); mutex_init(&fs_info->zoned_meta_io_lock); mutex_init(&fs_info->zoned_data_reloc_io_lock); seqlock_init(&fs_info->profiles_lock); btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers); btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters); btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered); btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent); btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_prep, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked, BTRFS_LOCKDEP_TRANS_UNBLOCKED); btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed, BTRFS_LOCKDEP_TRANS_COMPLETED); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); INIT_LIST_HEAD(&fs_info->reclaim_bgs); INIT_LIST_HEAD(&fs_info->zone_active_bgs); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&fs_info->allocated_roots); INIT_LIST_HEAD(&fs_info->allocated_ebs); spin_lock_init(&fs_info->eb_leak_lock); #endif fs_info->mapping_tree = RB_ROOT_CACHED; rwlock_init(&fs_info->mapping_tree_lock); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, BTRFS_BLOCK_RSV_DELOPS); btrfs_init_block_rsv(&fs_info->delayed_refs_rsv, BTRFS_BLOCK_RSV_DELREFS); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->nr_delayed_iputs, 0); atomic64_set(&fs_info->tree_mod_seq, 0); fs_info->global_root_tree = RB_ROOT; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; atomic64_set(&fs_info->free_chunk_space, 0); fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; btrfs_init_ref_verify(fs_info); fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); INIT_LIST_HEAD(&fs_info->ordered_roots); spin_lock_init(&fs_info->ordered_root_lock); btrfs_init_scrub(fs_info); btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); btrfs_init_extent_map_shrinker_work(fs_info); rwlock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT_CACHED; extent_io_tree_init(fs_info, &fs_info->excluded_extents, IO_TREE_FS_EXCLUDED_EXTENTS); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->ro_block_group_mutex); init_rwsem(&fs_info->commit_root_sem); init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); sema_init(&fs_info->uuid_tree_rescan_sem, 1); btrfs_init_dev_replace_locks(fs_info); btrfs_init_qgroup(fs_info); btrfs_discard_init(fs_info); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); init_waitqueue_head(&fs_info->transaction_throttle); init_waitqueue_head(&fs_info->transaction_wait); init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); init_waitqueue_head(&fs_info->delayed_iputs_wait); /* Usable values until the real ones are cached from the superblock */ fs_info->nodesize = 4096; fs_info->sectorsize = 4096; fs_info->sectorsize_bits = ilog2(4096); fs_info->stripesize = 4096; /* Default compress algorithm when user does -o compress */ fs_info->compress_type = BTRFS_COMPRESS_ZLIB; fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE; spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH; INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work); } static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) { int ret; fs_info->sb = sb; /* Temporary fixed values for block size until we read the superblock. */ sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL); if (ret) return ret; ret = percpu_counter_init(&fs_info->evictable_extent_maps, 0, GFP_KERNEL); if (ret) return ret; ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) return ret; ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL); if (ret) return ret; fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); if (ret) return ret; ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, GFP_KERNEL); if (ret) return ret; fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), GFP_KERNEL); if (!fs_info->delayed_root) return -ENOMEM; btrfs_init_delayed_root(fs_info->delayed_root); if (sb_rdonly(sb)) set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); if (btrfs_test_opt(fs_info, IGNOREMETACSUMS)) set_bit(BTRFS_FS_STATE_SKIP_META_CSUMS, &fs_info->fs_state); return btrfs_alloc_stripe_hash_table(fs_info); } static int btrfs_uuid_rescan_kthread(void *data) { struct btrfs_fs_info *fs_info = data; int ret; /* * 1st step is to iterate through the existing UUID tree and * to delete all entries that contain outdated data. * 2nd step is to add all missing entries to the UUID tree. */ ret = btrfs_uuid_tree_iterate(fs_info); if (ret < 0) { if (ret != -EINTR) btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); up(&fs_info->uuid_tree_rescan_sem); return ret; } return btrfs_uuid_scan_kthread(data); } static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) { struct task_struct *task; down(&fs_info->uuid_tree_rescan_sem); task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); if (IS_ERR(task)) { /* fs_info->update_uuid_tree_gen remains 0 in all error case */ btrfs_warn(fs_info, "failed to start uuid_rescan task"); up(&fs_info->uuid_tree_rescan_sem); return PTR_ERR(task); } return 0; } static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { u64 root_objectid = 0; struct btrfs_root *gang[8]; int ret = 0; while (1) { unsigned int found; spin_lock(&fs_info->fs_roots_radix_lock); found = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, root_objectid, ARRAY_SIZE(gang)); if (!found) { spin_unlock(&fs_info->fs_roots_radix_lock); break; } root_objectid = btrfs_root_id(gang[found - 1]) + 1; for (int i = 0; i < found; i++) { /* Avoid to grab roots in dead_roots. */ if (btrfs_root_refs(&gang[i]->root_item) == 0) { gang[i] = NULL; continue; } /* Grab all the search result for later use. */ gang[i] = btrfs_grab_root(gang[i]); } spin_unlock(&fs_info->fs_roots_radix_lock); for (int i = 0; i < found; i++) { if (!gang[i]) continue; root_objectid = btrfs_root_id(gang[i]); /* * Continue to release the remaining roots after the first * error without cleanup and preserve the first error * for the return. */ if (!ret) ret = btrfs_orphan_cleanup(gang[i]); btrfs_put_root(gang[i]); } if (ret) break; root_objectid++; } return ret; } /* * Mounting logic specific to read-write file systems. Shared by open_ctree * and btrfs_remount when remounting from read-only to read-write. */ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) { int ret; const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); bool rebuild_free_space_tree = false; if (btrfs_test_opt(fs_info, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) btrfs_warn(fs_info, "'clear_cache' option is ignored with extent tree v2"); else rebuild_free_space_tree = true; } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { btrfs_warn(fs_info, "free space tree is invalid"); rebuild_free_space_tree = true; } if (rebuild_free_space_tree) { btrfs_info(fs_info, "rebuilding free space tree"); ret = btrfs_rebuild_free_space_tree(fs_info); if (ret) { btrfs_warn(fs_info, "failed to rebuild free space tree: %d", ret); goto out; } } if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && !btrfs_test_opt(fs_info, FREE_SPACE_TREE)) { btrfs_info(fs_info, "disabling free space tree"); ret = btrfs_delete_free_space_tree(fs_info); if (ret) { btrfs_warn(fs_info, "failed to disable free space tree: %d", ret); goto out; } } /* * btrfs_find_orphan_roots() is responsible for finding all the dead * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load * them into the fs_info->fs_roots_radix tree. This must be done before * calling btrfs_orphan_cleanup() on the tree root. If we don't do it * first, then btrfs_orphan_cleanup() will delete a dead root's orphan * item before the root's tree is deleted - this means that if we unmount * or crash before the deletion completes, on the next mount we will not * delete what remains of the tree because the orphan item does not * exists anymore, which is what tells us we have a pending deletion. */ ret = btrfs_find_orphan_roots(fs_info); if (ret) goto out; ret = btrfs_cleanup_fs_roots(fs_info); if (ret) goto out; down_read(&fs_info->cleanup_work_sem); if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { up_read(&fs_info->cleanup_work_sem); goto out; } up_read(&fs_info->cleanup_work_sem); mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(fs_info); mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { btrfs_warn(fs_info, "failed to recover relocation: %d", ret); goto out; } if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { btrfs_info(fs_info, "creating free space tree"); ret = btrfs_create_free_space_tree(fs_info); if (ret) { btrfs_warn(fs_info, "failed to create free space tree: %d", ret); goto out; } } if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) { ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt); if (ret) goto out; } ret = btrfs_resume_balance_async(fs_info); if (ret) goto out; ret = btrfs_resume_dev_replace_async(fs_info); if (ret) { btrfs_warn(fs_info, "failed to resume dev_replace"); goto out; } btrfs_qgroup_rescan_resume(fs_info); if (!fs_info->uuid_root) { btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); if (ret) { btrfs_warn(fs_info, "failed to create the UUID tree %d", ret); goto out; } } out: return ret; } /* * Do various sanity and dependency checks of different features. * * @is_rw_mount: If the mount is read-write. * * This is the place for less strict checks (like for subpage or artificial * feature dependencies). * * For strict checks or possible corruption detection, see * btrfs_validate_super(). * * This should be called after btrfs_parse_options(), as some mount options * (space cache related) can modify on-disk format like free space tree and * screw up certain feature dependencies. */ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) { struct btrfs_super_block *disk_super = fs_info->super_copy; u64 incompat = btrfs_super_incompat_flags(disk_super); const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super); const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP); if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) { btrfs_err(fs_info, "cannot mount because of unknown incompat features (0x%llx)", incompat); return -EINVAL; } /* Runtime limitation for mixed block groups. */ if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && (fs_info->sectorsize != fs_info->nodesize)) { btrfs_err(fs_info, "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", fs_info->nodesize, fs_info->sectorsize); return -EINVAL; } /* Mixed backref is an always-enabled feature. */ incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; /* Set compression related flags just in case. */ if (fs_info->compress_type == BTRFS_COMPRESS_LZO) incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; /* * An ancient flag, which should really be marked deprecated. * Such runtime limitation doesn't really need a incompat flag. */ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; if (compat_ro_unsupp && is_rw_mount) { btrfs_err(fs_info, "cannot mount read-write because of unknown compat_ro features (0x%llx)", compat_ro); return -EINVAL; } /* * We have unsupported RO compat features, although RO mounted, we * should not cause any metadata writes, including log replay. * Or we could screw up whatever the new feature requires. */ if (compat_ro_unsupp && btrfs_super_log_root(disk_super) && !btrfs_test_opt(fs_info, NOLOGREPLAY)) { btrfs_err(fs_info, "cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay", compat_ro); return -EINVAL; } /* * Artificial limitations for block group tree, to force * block-group-tree to rely on no-holes and free-space-tree. */ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) && (!btrfs_fs_incompat(fs_info, NO_HOLES) || !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) { btrfs_err(fs_info, "block-group-tree feature requires no-holes and free-space-tree features"); return -EINVAL; } /* * Subpage runtime limitation on v1 cache. * * V1 space cache still has some hard codeed PAGE_SIZE usage, while * we're already defaulting to v2 cache, no need to bother v1 as it's * going to be deprecated anyway. */ if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { btrfs_warn(fs_info, "v1 space cache is not supported for page size %lu with sectorsize %u", PAGE_SIZE, fs_info->sectorsize); return -EINVAL; } /* This can be called by remount, we need to protect the super block. */ spin_lock(&fs_info->super_lock); btrfs_set_super_incompat_flags(disk_super, incompat); spin_unlock(&fs_info->super_lock); return 0; } int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices) { u32 sectorsize; u32 nodesize; u32 stripesize; u64 generation; u16 csum_type; struct btrfs_super_block *disk_super; struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *tree_root; struct btrfs_root *chunk_root; int ret; int level; ret = init_mount_fs_info(fs_info, sb); if (ret) goto fail; /* These need to be init'ed before we start creating inodes and such. */ tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL); fs_info->tree_root = tree_root; chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID, GFP_KERNEL); fs_info->chunk_root = chunk_root; if (!tree_root || !chunk_root) { ret = -ENOMEM; goto fail; } ret = btrfs_init_btree_inode(sb); if (ret) goto fail; invalidate_bdev(fs_devices->latest_dev->bdev); /* * Read super block and check the signature bytes only */ disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev); if (IS_ERR(disk_super)) { ret = PTR_ERR(disk_super); goto fail_alloc; } btrfs_info(fs_info, "first mount of filesystem %pU", disk_super->fsid); /* * Verify the type first, if that or the checksum value are * corrupted, we'll find out */ csum_type = btrfs_super_csum_type(disk_super); if (!btrfs_supported_super_csum(csum_type)) { btrfs_err(fs_info, "unsupported checksum algorithm: %u", csum_type); ret = -EINVAL; btrfs_release_disk_super(disk_super); goto fail_alloc; } fs_info->csum_size = btrfs_super_csum_size(disk_super); ret = btrfs_init_csum_hash(fs_info, csum_type); if (ret) { btrfs_release_disk_super(disk_super); goto fail_alloc; } /* * We want to check superblock checksum, the type is stored inside. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ if (btrfs_check_super_csum(fs_info, disk_super)) { btrfs_err(fs_info, "superblock checksum mismatch"); ret = -EINVAL; btrfs_release_disk_super(disk_super); goto fail_alloc; } /* * super_copy is zeroed at allocation time and we never touch the * following bytes up to INFO_SIZE, the checksum is calculated from * the whole block of INFO_SIZE */ memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy)); btrfs_release_disk_super(disk_super); disk_super = fs_info->super_copy; memcpy(fs_info->super_for_commit, fs_info->super_copy, sizeof(*fs_info->super_for_commit)); ret = btrfs_validate_mount_super(fs_info); if (ret) { btrfs_err(fs_info, "superblock contains fatal errors"); ret = -EINVAL; goto fail_alloc; } if (!btrfs_super_root(disk_super)) { btrfs_err(fs_info, "invalid superblock tree root bytenr"); ret = -EINVAL; goto fail_alloc; } /* check FS state, whether FS is broken. */ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) WRITE_ONCE(fs_info->fs_error, -EUCLEAN); /* Set up fs_info before parsing mount options */ nodesize = btrfs_super_nodesize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); stripesize = sectorsize; fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; fs_info->fs_devices->fs_info = fs_info; /* * Handle the space caching options appropriately now that we have the * super block loaded and validated. */ btrfs_set_free_space_cache_settings(fs_info); if (!btrfs_check_options(fs_info, &fs_info->mount_opt, sb->s_flags)) { ret = -EINVAL; goto fail_alloc; } ret = btrfs_check_features(fs_info, !sb_rdonly(sb)); if (ret < 0) goto fail_alloc; /* * At this point our mount options are validated, if we set ->max_inline * to something non-standard make sure we truncate it to sectorsize. */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); ret = btrfs_init_workqueues(fs_info); if (ret) goto fail_sb_buffer; sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); /* Update the values for the current filesystem. */ sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE); mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(fs_info); mutex_unlock(&fs_info->chunk_mutex); if (ret) { btrfs_err(fs_info, "failed to read the system array: %d", ret); goto fail_sb_buffer; } generation = btrfs_super_chunk_root_generation(disk_super); level = btrfs_super_chunk_root_level(disk_super); ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super), generation, level); if (ret) { btrfs_err(fs_info, "failed to read chunk root"); goto fail_tree_roots; } read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, offsetof(struct btrfs_header, chunk_tree_uuid), BTRFS_UUID_SIZE); ret = btrfs_read_chunk_tree(fs_info); if (ret) { btrfs_err(fs_info, "failed to read chunk tree: %d", ret); goto fail_tree_roots; } /* * At this point we know all the devices that make this filesystem, * including the seed devices but we don't know yet if the replace * target is required. So free devices that are not part of this * filesystem but skip the replace target device which is checked * below in btrfs_init_dev_replace(). */ btrfs_free_extra_devids(fs_devices); if (!fs_devices->latest_dev->bdev) { btrfs_err(fs_info, "failed to read devices"); ret = -EIO; goto fail_tree_roots; } ret = init_tree_roots(fs_info); if (ret) goto fail_tree_roots; /* * Get zone type information of zoned block devices. This will also * handle emulation of a zoned filesystem if a regular device has the * zoned incompat feature flag set. */ ret = btrfs_get_dev_zone_info_all_devices(fs_info); if (ret) { btrfs_err(fs_info, "zoned: failed to read device zone info: %d", ret); goto fail_block_groups; } /* * If we have a uuid root and we're not being told to rescan we need to * check the generation here so we can set the * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the * transaction during a balance or the log replay without updating the * uuid generation, and then if we crash we would rescan the uuid tree, * even though it was perfectly fine. */ if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) && fs_info->generation == btrfs_super_uuid_tree_generation(disk_super)) set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); ret = btrfs_verify_dev_extents(fs_info); if (ret) { btrfs_err(fs_info, "failed to verify dev extents against chunks: %d", ret); goto fail_block_groups; } ret = btrfs_recover_balance(fs_info); if (ret) { btrfs_err(fs_info, "failed to recover balance: %d", ret); goto fail_block_groups; } ret = btrfs_init_dev_stats(fs_info); if (ret) { btrfs_err(fs_info, "failed to init dev_stats: %d", ret); goto fail_block_groups; } ret = btrfs_init_dev_replace(fs_info); if (ret) { btrfs_err(fs_info, "failed to init dev_replace: %d", ret); goto fail_block_groups; } ret = btrfs_check_zoned_mode(fs_info); if (ret) { btrfs_err(fs_info, "failed to initialize zoned mode: %d", ret); goto fail_block_groups; } ret = btrfs_sysfs_add_fsid(fs_devices); if (ret) { btrfs_err(fs_info, "failed to init sysfs fsid interface: %d", ret); goto fail_block_groups; } ret = btrfs_sysfs_add_mounted(fs_info); if (ret) { btrfs_err(fs_info, "failed to init sysfs interface: %d", ret); goto fail_fsdev_sysfs; } ret = btrfs_init_space_info(fs_info); if (ret) { btrfs_err(fs_info, "failed to initialize space info: %d", ret); goto fail_sysfs; } ret = btrfs_read_block_groups(fs_info); if (ret) { btrfs_err(fs_info, "failed to read block groups: %d", ret); goto fail_sysfs; } btrfs_free_zone_cache(fs_info); btrfs_check_active_zone_reservation(fs_info); if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, "writable mount is not allowed due to too many missing devices"); ret = -EINVAL; goto fail_sysfs; } fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) { ret = PTR_ERR(fs_info->cleaner_kthread); goto fail_sysfs; } fs_info->transaction_kthread = kthread_run(transaction_kthread, tree_root, "btrfs-transaction"); if (IS_ERR(fs_info->transaction_kthread)) { ret = PTR_ERR(fs_info->transaction_kthread); goto fail_cleaner; } ret = btrfs_read_qgroup_config(fs_info); if (ret) goto fail_trans_kthread; if (btrfs_build_ref_tree(fs_info)) btrfs_err(fs_info, "couldn't build ref tree"); /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && !btrfs_test_opt(fs_info, NOLOGREPLAY)) { btrfs_info(fs_info, "start tree-log replay"); ret = btrfs_replay_log(fs_info, fs_devices); if (ret) goto fail_qgroup; } fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true); if (IS_ERR(fs_info->fs_root)) { ret = PTR_ERR(fs_info->fs_root); btrfs_warn(fs_info, "failed to read fs tree: %d", ret); fs_info->fs_root = NULL; goto fail_qgroup; } if (sb_rdonly(sb)) return 0; ret = btrfs_start_pre_rw_mount(fs_info); if (ret) { close_ctree(fs_info); return ret; } btrfs_discard_resume(fs_info); if (fs_info->uuid_root && (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) || fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) { btrfs_info(fs_info, "checking UUID tree"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { btrfs_warn(fs_info, "failed to check the UUID tree: %d", ret); close_ctree(fs_info); return ret; } } set_bit(BTRFS_FS_OPEN, &fs_info->flags); /* Kick the cleaner thread so it'll start deleting snapshots. */ if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); return 0; fail_qgroup: btrfs_free_qgroup_config(fs_info); fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); btrfs_cleanup_transaction(fs_info); btrfs_free_fs_roots(fs_info); fail_cleaner: kthread_stop(fs_info->cleaner_kthread); /* * make sure we're done with the btree inode before we stop our * kthreads */ filemap_write_and_wait(fs_info->btree_inode->i_mapping); fail_sysfs: btrfs_sysfs_remove_mounted(fs_info); fail_fsdev_sysfs: btrfs_sysfs_remove_fsid(fs_info->fs_devices); fail_block_groups: btrfs_put_block_group_cache(fs_info); fail_tree_roots: if (fs_info->data_reloc_root) btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root); free_root_pointers(fs_info, true); invalidate_inode_pages2(fs_info->btree_inode->i_mapping); fail_sb_buffer: btrfs_stop_all_workers(fs_info); btrfs_free_block_groups(fs_info); fail_alloc: btrfs_mapping_tree_free(fs_info); iput(fs_info->btree_inode); fail: btrfs_close_devices(fs_info->fs_devices); ASSERT(ret < 0); return ret; } ALLOW_ERROR_INJECTION(open_ctree, ERRNO); static void btrfs_end_super_write(struct bio *bio) { struct btrfs_device *device = bio->bi_private; struct folio_iter fi; bio_for_each_folio_all(fi, bio) { if (bio->bi_status) { btrfs_warn_rl_in_rcu(device->fs_info, "lost super block write due to IO error on %s (%d)", btrfs_dev_name(device), blk_status_to_errno(bio->bi_status)); btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); /* Ensure failure if the primary sb fails. */ if (bio->bi_opf & REQ_FUA) atomic_add(BTRFS_SUPER_PRIMARY_WRITE_ERROR, &device->sb_write_errors); else atomic_inc(&device->sb_write_errors); } folio_unlock(fi.folio); folio_put(fi.folio); } bio_put(bio); } struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, bool drop_cache) { struct btrfs_super_block *super; struct page *page; u64 bytenr, bytenr_orig; struct address_space *mapping = bdev->bd_mapping; int ret; bytenr_orig = btrfs_sb_offset(copy_num); ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr); if (ret == -ENOENT) return ERR_PTR(-EINVAL); else if (ret) return ERR_PTR(ret); if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); if (drop_cache) { /* This should only be called with the primary sb. */ ASSERT(copy_num == 0); /* * Drop the page of the primary superblock, so later read will * always read from the device. */ invalidate_inode_pages2_range(mapping, bytenr >> PAGE_SHIFT, (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT); } page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); if (IS_ERR(page)) return ERR_CAST(page); super = page_address(page); if (btrfs_super_magic(super) != BTRFS_MAGIC) { btrfs_release_disk_super(super); return ERR_PTR(-ENODATA); } if (btrfs_super_bytenr(super) != bytenr_orig) { btrfs_release_disk_super(super); return ERR_PTR(-EINVAL); } return super; } struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) { struct btrfs_super_block *super, *latest = NULL; int i; u64 transid = 0; /* we would like to check all the supers, but that would make * a btrfs mount succeed after a mkfs from a different FS. * So, we need to add a special mount option to scan for * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ for (i = 0; i < 1; i++) { super = btrfs_read_dev_one_super(bdev, i, false); if (IS_ERR(super)) continue; if (!latest || btrfs_super_generation(super) > transid) { if (latest) btrfs_release_disk_super(super); latest = super; transid = btrfs_super_generation(super); } } return super; } /* * Write superblock @sb to the @device. Do not wait for completion, all the * folios we use for writing are locked. * * Write @max_mirrors copies of the superblock, where 0 means default that fit * the expected device size at commit time. Note that max_mirrors must be * same for write and wait phases. * * Return number of errors when folio is not found or submission fails. */ static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int max_mirrors) { struct btrfs_fs_info *fs_info = device->fs_info; struct address_space *mapping = device->bdev->bd_mapping; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); int i; int ret; u64 bytenr, bytenr_orig; atomic_set(&device->sb_write_errors, 0); if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; shash->tfm = fs_info->csum_shash; for (i = 0; i < max_mirrors; i++) { struct folio *folio; struct bio *bio; struct btrfs_super_block *disk_super; size_t offset; bytenr_orig = btrfs_sb_offset(i); ret = btrfs_sb_log_location(device, i, WRITE, &bytenr); if (ret == -ENOENT) { continue; } else if (ret < 0) { btrfs_err(device->fs_info, "couldn't get super block location for mirror %d", i); atomic_inc(&device->sb_write_errors); continue; } if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes) break; btrfs_set_super_bytenr(sb, bytenr_orig); crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, sb->csum); folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); if (IS_ERR(folio)) { btrfs_err(device->fs_info, "couldn't get super block page for bytenr %llu", bytenr); atomic_inc(&device->sb_write_errors); continue; } ASSERT(folio_order(folio) == 0); offset = offset_in_folio(folio, bytenr); disk_super = folio_address(folio) + offset; memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE); /* * Directly use bios here instead of relying on the page cache * to do I/O, so we don't lose the ability to do integrity * checking. */ bio = bio_alloc(device->bdev, 1, REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, GFP_NOFS); bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; bio->bi_private = device; bio->bi_end_io = btrfs_end_super_write; bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset); /* * We FUA only the first super block. The others we allow to * go down lazy and there's a short window where the on-disk * copies might still contain the older version. */ if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) bio->bi_opf |= REQ_FUA; submit_bio(bio); if (btrfs_advance_sb_log(device, i)) atomic_inc(&device->sb_write_errors); } return atomic_read(&device->sb_write_errors) < i ? 0 : -1; } /* * Wait for write completion of superblocks done by write_dev_supers, * @max_mirrors same for write and wait phases. * * Return -1 if primary super block write failed or when there were no super block * copies written. Otherwise 0. */ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) { int i; int errors = 0; bool primary_failed = false; int ret; u64 bytenr; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; for (i = 0; i < max_mirrors; i++) { struct folio *folio; ret = btrfs_sb_log_location(device, i, READ, &bytenr); if (ret == -ENOENT) { break; } else if (ret < 0) { errors++; if (i == 0) primary_failed = true; continue; } if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes) break; folio = filemap_get_folio(device->bdev->bd_mapping, bytenr >> PAGE_SHIFT); /* If the folio has been removed, then we know it completed. */ if (IS_ERR(folio)) continue; ASSERT(folio_order(folio) == 0); /* Folio will be unlocked once the write completes. */ folio_wait_locked(folio); folio_put(folio); } errors += atomic_read(&device->sb_write_errors); if (errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR) primary_failed = true; if (primary_failed) { btrfs_err(device->fs_info, "error writing primary super block to device %llu", device->devid); return -1; } return errors < i ? 0 : -1; } /* * endio for the write_dev_flush, this will wake anyone waiting * for the barrier when it is done */ static void btrfs_end_empty_barrier(struct bio *bio) { bio_uninit(bio); complete(bio->bi_private); } /* * Submit a flush request to the device if it supports it. Error handling is * done in the waiting counterpart. */ static void write_dev_flush(struct btrfs_device *device) { struct bio *bio = &device->flush_bio; device->last_flush_error = BLK_STS_OK; bio_init(bio, device->bdev, NULL, 0, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); bio->bi_end_io = btrfs_end_empty_barrier; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; submit_bio(bio); set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); } /* * If the flush bio has been submitted by write_dev_flush, wait for it. * Return true for any error, and false otherwise. */ static bool wait_dev_flush(struct btrfs_device *device) { struct bio *bio = &device->flush_bio; if (!test_and_clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) return false; wait_for_completion_io(&device->flush_wait); if (bio->bi_status) { device->last_flush_error = bio->bi_status; btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); return true; } return false; } /* * send an empty flush down to each device in parallel, * then wait for them */ static int barrier_all_devices(struct btrfs_fs_info *info) { struct list_head *head; struct btrfs_device *dev; int errors_wait = 0; lockdep_assert_held(&info->fs_devices->device_list_mutex); /* send down all the barriers */ head = &info->fs_devices->devices; list_for_each_entry(dev, head, dev_list) { if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) continue; if (!dev->bdev) continue; if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; write_dev_flush(dev); } /* wait for all the barriers */ list_for_each_entry(dev, head, dev_list) { if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) continue; if (!dev->bdev) { errors_wait++; continue; } if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; if (wait_dev_flush(dev)) errors_wait++; } /* * Checks last_flush_error of disks in order to determine the device * state. */ if (errors_wait && !btrfs_check_rw_degradable(info, NULL)) return -EIO; return 0; } int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) { int raid_type; int min_tolerated = INT_MAX; if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) min_tolerated = min_t(int, min_tolerated, btrfs_raid_array[BTRFS_RAID_SINGLE]. tolerated_failures); for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { if (raid_type == BTRFS_RAID_SINGLE) continue; if (!(flags & btrfs_raid_array[raid_type].bg_flag)) continue; min_tolerated = min_t(int, min_tolerated, btrfs_raid_array[raid_type]. tolerated_failures); } if (min_tolerated == INT_MAX) { pr_warn("BTRFS: unknown raid flag: %llu", flags); min_tolerated = 0; } return min_tolerated; } int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) { struct list_head *head; struct btrfs_device *dev; struct btrfs_super_block *sb; struct btrfs_dev_item *dev_item; int ret; int do_barriers; int max_errors; int total_errors = 0; u64 flags; do_barriers = !btrfs_test_opt(fs_info, NOBARRIER); /* * max_mirrors == 0 indicates we're from commit_transaction, * not from fsync where the tree roots in fs_info have not * been consistent on disk. */ if (max_mirrors == 0) backup_super_roots(fs_info); sb = fs_info->super_for_commit; dev_item = &sb->dev_item; mutex_lock(&fs_info->fs_devices->device_list_mutex); head = &fs_info->fs_devices->devices; max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1; if (do_barriers) { ret = barrier_all_devices(fs_info); if (ret) { mutex_unlock( &fs_info->fs_devices->device_list_mutex); btrfs_handle_fs_error(fs_info, ret, "errors while submitting device barriers."); return ret; } } list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) { total_errors++; continue; } if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; btrfs_set_stack_device_generation(dev_item, 0); btrfs_set_stack_device_type(dev_item, dev->type); btrfs_set_stack_device_id(dev_item, dev->devid); btrfs_set_stack_device_total_bytes(dev_item, dev->commit_total_bytes); btrfs_set_stack_device_bytes_used(dev_item, dev->commit_bytes_used); btrfs_set_stack_device_io_align(dev_item, dev->io_align); btrfs_set_stack_device_io_width(dev_item, dev->io_width); btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid, BTRFS_FSID_SIZE); flags = btrfs_super_flags(sb); btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); ret = btrfs_validate_write_super(fs_info, sb); if (ret < 0) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_handle_fs_error(fs_info, -EUCLEAN, "unexpected superblock corruption detected"); return -EUCLEAN; } ret = write_dev_supers(dev, sb, max_mirrors); if (ret) total_errors++; } if (total_errors > max_errors) { btrfs_err(fs_info, "%d errors while writing supers", total_errors); mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* FUA is masked off if unsupported and can't be the reason */ btrfs_handle_fs_error(fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } total_errors = 0; list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) continue; if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; ret = wait_dev_supers(dev, max_mirrors); if (ret) total_errors++; } mutex_unlock(&fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { btrfs_handle_fs_error(fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } return 0; } /* Drop a fs root from the radix tree and free it. */ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { bool drop_ref = false; spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, (unsigned long)btrfs_root_id(root)); if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); if (BTRFS_FS_ERROR(fs_info)) { ASSERT(root->log_root == NULL); if (root->reloc_root) { btrfs_put_root(root->reloc_root); root->reloc_root = NULL; } } if (drop_ref) btrfs_put_root(root); } int btrfs_commit_super(struct btrfs_fs_info *fs_info) { mutex_lock(&fs_info->cleaner_mutex); btrfs_run_delayed_iputs(fs_info); mutex_unlock(&fs_info->cleaner_mutex); wake_up_process(fs_info->cleaner_kthread); /* wait until ongoing cleanup work done */ down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); return btrfs_commit_current_transaction(fs_info->tree_root); } static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) { struct btrfs_transaction *trans; struct btrfs_transaction *tmp; bool found = false; /* * This function is only called at the very end of close_ctree(), * thus no other running transaction, no need to take trans_lock. */ ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)); list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) { struct extent_state *cached = NULL; u64 dirty_bytes = 0; u64 cur = 0; u64 found_start; u64 found_end; found = true; while (find_first_extent_bit(&trans->dirty_pages, cur, &found_start, &found_end, EXTENT_DIRTY, &cached)) { dirty_bytes += found_end + 1 - found_start; cur = found_end + 1; } btrfs_warn(fs_info, "transaction %llu (with %llu dirty metadata bytes) is not committed", trans->transid, dirty_bytes); btrfs_cleanup_one_transaction(trans); if (trans == fs_info->running_transaction) fs_info->running_transaction = NULL; list_del_init(&trans->list); btrfs_put_transaction(trans); trace_btrfs_transaction_commit(fs_info); } ASSERT(!found); } void __cold close_ctree(struct btrfs_fs_info *fs_info) { int ret; set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); /* * If we had UNFINISHED_DROPS we could still be processing them, so * clear that bit and wake up relocation so it can stop. * We must do this before stopping the block group reclaim task, because * at btrfs_relocate_block_group() we wait for this bit, and after the * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will * return 1. */ btrfs_wake_unfinished_drop(fs_info); /* * We may have the reclaim task running and relocating a data block group, * in which case it may create delayed iputs. So stop it before we park * the cleaner kthread otherwise we can get new delayed iputs after * parking the cleaner, and that can make the async reclaim task to hang * if it's waiting for delayed iputs to complete, since the cleaner is * parked and can not run delayed iputs - this will make us hang when * trying to stop the async reclaim task. */ cancel_work_sync(&fs_info->reclaim_bgs_work); /* * We don't want the cleaner to start new transactions, add more delayed * iputs, etc. while we're closing. We can't use kthread_stop() yet * because that frees the task_struct, and the transaction kthread might * still try to wake up the cleaner. */ kthread_park(fs_info->cleaner_kthread); /* wait for the qgroup rescan worker to stop */ btrfs_qgroup_wait_for_completion(fs_info, false); /* wait for the uuid_scan task to finish */ down(&fs_info->uuid_tree_rescan_sem); /* avoid complains from lockdep et al., set sem back to initial state */ up(&fs_info->uuid_tree_rescan_sem); /* pause restriper - we want to resume on mount */ btrfs_pause_balance(fs_info); btrfs_dev_replace_suspend_for_unmount(fs_info); btrfs_scrub_cancel(fs_info); /* wait for any defraggers to finish */ wait_event(fs_info->transaction_wait, (atomic_read(&fs_info->defrag_running) == 0)); /* clear out the rbtree of defraggable inodes */ btrfs_cleanup_defrag_inodes(fs_info); /* * Handle the error fs first, as it will flush and wait for all ordered * extents. This will generate delayed iputs, thus we want to handle * it first. */ if (unlikely(BTRFS_FS_ERROR(fs_info))) btrfs_error_commit_super(fs_info); /* * Wait for any fixup workers to complete. * If we don't wait for them here and they are still running by the time * we call kthread_stop() against the cleaner kthread further below, we * get an use-after-free on the cleaner because the fixup worker adds an * inode to the list of delayed iputs and then attempts to wakeup the * cleaner kthread, which was already stopped and destroyed. We parked * already the cleaner, but below we run all pending delayed iputs. */ btrfs_flush_workqueue(fs_info->fixup_workers); /* * Similar case here, we have to wait for delalloc workers before we * proceed below and stop the cleaner kthread, otherwise we trigger a * use-after-tree on the cleaner kthread task_struct when a delalloc * worker running submit_compressed_extents() adds a delayed iput, which * does a wake up on the cleaner kthread, which was already freed below * when we call kthread_stop(). */ btrfs_flush_workqueue(fs_info->delalloc_workers); /* * We can have ordered extents getting their last reference dropped from * the fs_info->workers queue because for async writes for data bios we * queue a work for that queue, at btrfs_wq_submit_bio(), that runs * run_one_async_done() which calls btrfs_bio_end_io() in case the bio * has an error, and that later function can do the final * btrfs_put_ordered_extent() on the ordered extent attached to the bio, * which adds a delayed iput for the inode. So we must flush the queue * so that we don't have delayed iputs after committing the current * transaction below and stopping the cleaner and transaction kthreads. */ btrfs_flush_workqueue(fs_info->workers); /* * When finishing a compressed write bio we schedule a work queue item * to finish an ordered extent - btrfs_finish_compressed_write_work() * calls btrfs_finish_ordered_extent() which in turns does a call to * btrfs_queue_ordered_fn(), and that queues the ordered extent * completion either in the endio_write_workers work queue or in the * fs_info->endio_freespace_worker work queue. We flush those queues * below, so before we flush them we must flush this queue for the * workers of compressed writes. */ flush_workqueue(fs_info->compressed_write_workers); /* * After we parked the cleaner kthread, ordered extents may have * completed and created new delayed iputs. If one of the async reclaim * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we * can hang forever trying to stop it, because if a delayed iput is * added after it ran btrfs_run_delayed_iputs() and before it called * btrfs_wait_on_delayed_iputs(), it will hang forever since there is * no one else to run iputs. * * So wait for all ongoing ordered extents to complete and then run * delayed iputs. This works because once we reach this point no one * can either create new ordered extents nor create delayed iputs * through some other means. * * Also note that btrfs_wait_ordered_roots() is not safe here, because * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent, * but the delayed iput for the respective inode is made only when doing * the final btrfs_put_ordered_extent() (which must happen at * btrfs_finish_ordered_io() when we are unmounting). */ btrfs_flush_workqueue(fs_info->endio_write_workers); /* Ordered extents for free space inodes. */ btrfs_flush_workqueue(fs_info->endio_freespace_worker); btrfs_run_delayed_iputs(fs_info); /* There should be no more workload to generate new delayed iputs. */ set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); cancel_work_sync(&fs_info->em_shrinker_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); if (!sb_rdonly(fs_info->sb)) { /* * The cleaner kthread is stopped, so do one final pass over * unused block groups. */ btrfs_delete_unused_bgs(fs_info); /* * There might be existing delayed inode workers still running * and holding an empty delayed inode item. We must wait for * them to complete first because they can create a transaction. * This happens when someone calls btrfs_balance_delayed_items() * and then a transaction commit runs the same delayed nodes * before any delayed worker has done something with the nodes. * We must wait for any worker here and not at transaction * commit time since that could cause a deadlock. * This is a very rare case. */ btrfs_flush_workqueue(fs_info->delayed_workers); ret = btrfs_commit_super(fs_info); if (ret) btrfs_err(fs_info, "commit super ret %d", ret); } kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); ASSERT(list_empty(&fs_info->delayed_iputs)); set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); if (btrfs_check_quota_leak(fs_info)) { WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); btrfs_err(fs_info, "qgroup reserved space leaked"); } btrfs_free_qgroup_config(fs_info); ASSERT(list_empty(&fs_info->delalloc_roots)); if (percpu_counter_sum(&fs_info->delalloc_bytes)) { btrfs_info(fs_info, "at unmount delalloc count %lld", percpu_counter_sum(&fs_info->delalloc_bytes)); } if (percpu_counter_sum(&fs_info->ordered_bytes)) btrfs_info(fs_info, "at unmount dio bytes count %lld", percpu_counter_sum(&fs_info->ordered_bytes)); btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); btrfs_put_block_group_cache(fs_info); /* * we must make sure there is not any read request to * submit after we stopping all workers. */ invalidate_inode_pages2(fs_info->btree_inode->i_mapping); btrfs_stop_all_workers(fs_info); /* We shouldn't have any transaction open at this point */ warn_about_uncommitted_trans(fs_info); clear_bit(BTRFS_FS_OPEN, &fs_info->flags); free_root_pointers(fs_info, true); btrfs_free_fs_roots(fs_info); /* * We must free the block groups after dropping the fs_roots as we could * have had an IO error and have left over tree log blocks that aren't * cleaned up until the fs roots are freed. This makes the block group * accounting appear to be wrong because there's pending reserved bytes, * so make sure we do the block group cleanup afterwards. */ btrfs_free_block_groups(fs_info); iput(fs_info->btree_inode); btrfs_mapping_tree_free(fs_info); btrfs_close_devices(fs_info->fs_devices); } void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *buf) { struct btrfs_fs_info *fs_info = buf->fs_info; u64 transid = btrfs_header_generation(buf); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* * This is a fast path so only do this check if we have sanity tests * enabled. Normal people shouldn't be using unmapped buffers as dirty * outside of the sanity tests. */ if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) return; #endif /* This is an active transaction (its state < TRANS_STATE_UNBLOCKED). */ ASSERT(trans->transid == fs_info->generation); btrfs_assert_tree_write_locked(buf); if (unlikely(transid != fs_info->generation)) { btrfs_abort_transaction(trans, -EUCLEAN); btrfs_crit(fs_info, "dirty buffer transid mismatch, logical %llu found transid %llu running transid %llu", buf->start, transid, fs_info->generation); } set_extent_buffer_dirty(buf); } static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info, int flush_delayed) { /* * looks as though older kernels can get into trouble with * this code, they end up stuck in balance_dirty_pages forever */ int ret; if (current->flags & PF_MEMALLOC) return; if (flush_delayed) btrfs_balance_delayed_items(fs_info); ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes, BTRFS_DIRTY_METADATA_THRESH, fs_info->dirty_metadata_batch); if (ret > 0) { balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping); } } void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info) { __btrfs_btree_balance_dirty(fs_info, 1); } void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info) { __btrfs_btree_balance_dirty(fs_info, 0); } static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) { /* cleanup FS via transaction */ btrfs_cleanup_transaction(fs_info); down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); } static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) { struct btrfs_root *gang[8]; u64 root_objectid = 0; int ret; spin_lock(&fs_info->fs_roots_radix_lock); while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, root_objectid, ARRAY_SIZE(gang))) != 0) { int i; for (i = 0; i < ret; i++) gang[i] = btrfs_grab_root(gang[i]); spin_unlock(&fs_info->fs_roots_radix_lock); for (i = 0; i < ret; i++) { if (!gang[i]) continue; root_objectid = btrfs_root_id(gang[i]); btrfs_free_log(NULL, gang[i]); btrfs_put_root(gang[i]); } root_objectid++; spin_lock(&fs_info->fs_roots_radix_lock); } spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log_root_tree(NULL, fs_info); } static void btrfs_destroy_ordered_extents(struct btrfs_root *root) { struct btrfs_ordered_extent *ordered; spin_lock(&root->ordered_extent_lock); /* * This will just short circuit the ordered completion stuff which will * make sure the ordered extent gets properly cleaned up. */ list_for_each_entry(ordered, &root->ordered_extents, root_extent_list) set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); spin_unlock(&root->ordered_extent_lock); } static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; LIST_HEAD(splice); spin_lock(&fs_info->ordered_root_lock); list_splice_init(&fs_info->ordered_roots, &splice); while (!list_empty(&splice)) { root = list_first_entry(&splice, struct btrfs_root, ordered_root); list_move_tail(&root->ordered_root, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); btrfs_destroy_ordered_extents(root); cond_resched(); spin_lock(&fs_info->ordered_root_lock); } spin_unlock(&fs_info->ordered_root_lock); /* * We need this here because if we've been flipped read-only we won't * get sync() from the umount, so we need to make sure any ordered * extents that haven't had their dirty pages IO start writeout yet * actually get run and error out properly. */ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); } static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; LIST_HEAD(splice); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { struct inode *inode = NULL; btrfs_inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); btrfs_del_delalloc_inode(btrfs_inode); spin_unlock(&root->delalloc_lock); /* * Make sure we get a live inode and that it'll not disappear * meanwhile. */ inode = igrab(&btrfs_inode->vfs_inode); if (inode) { unsigned int nofs_flag; nofs_flag = memalloc_nofs_save(); invalidate_inode_pages2(inode->i_mapping); memalloc_nofs_restore(nofs_flag); iput(inode); } spin_lock(&root->delalloc_lock); } spin_unlock(&root->delalloc_lock); } static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; LIST_HEAD(splice); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); while (!list_empty(&splice)) { root = list_first_entry(&splice, struct btrfs_root, delalloc_root); root = btrfs_grab_root(root); BUG_ON(!root); spin_unlock(&fs_info->delalloc_root_lock); btrfs_destroy_delalloc_inodes(root); btrfs_put_root(root); spin_lock(&fs_info->delalloc_root_lock); } spin_unlock(&fs_info->delalloc_root_lock); } static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark) { struct extent_buffer *eb; u64 start = 0; u64 end; while (find_first_extent_bit(dirty_pages, start, &start, &end, mark, NULL)) { clear_extent_bits(dirty_pages, start, end, mark); while (start <= end) { eb = find_extent_buffer(fs_info, start); start += fs_info->nodesize; if (!eb) continue; btrfs_tree_lock(eb); wait_on_extent_buffer_writeback(eb); btrfs_clear_buffer_dirty(NULL, eb); btrfs_tree_unlock(eb); free_extent_buffer_stale(eb); } } } static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, struct extent_io_tree *unpin) { u64 start; u64 end; while (1) { struct extent_state *cached_state = NULL; /* * The btrfs_finish_extent_commit() may get the same range as * ours between find_first_extent_bit and clear_extent_dirty. * Hence, hold the unused_bg_unpin_mutex to avoid double unpin * the same extent range. */ mutex_lock(&fs_info->unused_bg_unpin_mutex); if (!find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, &cached_state)) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } clear_extent_dirty(unpin, start, end, &cached_state); free_extent_state(cached_state); btrfs_error_unpin_extent_range(fs_info, start, end); mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); } } static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache) { struct inode *inode; inode = cache->io_ctl.inode; if (inode) { unsigned int nofs_flag; nofs_flag = memalloc_nofs_save(); invalidate_inode_pages2(inode->i_mapping); memalloc_nofs_restore(nofs_flag); BTRFS_I(inode)->generation = 0; cache->io_ctl.inode = NULL; iput(inode); } ASSERT(cache->io_ctl.pages == NULL); btrfs_put_block_group(cache); } void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, struct btrfs_fs_info *fs_info) { struct btrfs_block_group *cache; spin_lock(&cur_trans->dirty_bgs_lock); while (!list_empty(&cur_trans->dirty_bgs)) { cache = list_first_entry(&cur_trans->dirty_bgs, struct btrfs_block_group, dirty_list); if (!list_empty(&cache->io_list)) { spin_unlock(&cur_trans->dirty_bgs_lock); list_del_init(&cache->io_list); btrfs_cleanup_bg_io(cache); spin_lock(&cur_trans->dirty_bgs_lock); } list_del_init(&cache->dirty_list); spin_lock(&cache->lock); cache->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&cache->lock); spin_unlock(&cur_trans->dirty_bgs_lock); btrfs_put_block_group(cache); btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); /* * Refer to the definition of io_bgs member for details why it's safe * to use it without any locking */ while (!list_empty(&cur_trans->io_bgs)) { cache = list_first_entry(&cur_trans->io_bgs, struct btrfs_block_group, io_list); list_del_init(&cache->io_list); spin_lock(&cache->lock); cache->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&cache->lock); btrfs_cleanup_bg_io(cache); } } static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info) { struct btrfs_root *gang[8]; int i; int ret; spin_lock(&fs_info->fs_roots_radix_lock); while (1) { ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, (void **)gang, 0, ARRAY_SIZE(gang), BTRFS_ROOT_TRANS_TAG); if (ret == 0) break; for (i = 0; i < ret; i++) { struct btrfs_root *root = gang[i]; btrfs_qgroup_free_meta_all_pertrans(root); radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); } } spin_unlock(&fs_info->fs_roots_radix_lock); } void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans) { struct btrfs_fs_info *fs_info = cur_trans->fs_info; struct btrfs_device *dev, *tmp; btrfs_cleanup_dirty_bgs(cur_trans, fs_info); ASSERT(list_empty(&cur_trans->dirty_bgs)); ASSERT(list_empty(&cur_trans->io_bgs)); list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list, post_commit_list) { list_del_init(&dev->post_commit_list); } btrfs_destroy_delayed_refs(cur_trans); cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&fs_info->transaction_blocked_wait); cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&fs_info->transaction_wait); btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) { struct btrfs_transaction *t; mutex_lock(&fs_info->transaction_kthread_mutex); spin_lock(&fs_info->trans_lock); while (!list_empty(&fs_info->trans_list)) { t = list_first_entry(&fs_info->trans_list, struct btrfs_transaction, list); if (t->state >= TRANS_STATE_COMMIT_PREP) { refcount_inc(&t->use_count); spin_unlock(&fs_info->trans_lock); btrfs_wait_for_commit(fs_info, t->transid); btrfs_put_transaction(t); spin_lock(&fs_info->trans_lock); continue; } if (t == fs_info->running_transaction) { t->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&fs_info->trans_lock); /* * We wait for 0 num_writers since we don't hold a trans * handle open currently for this transaction. */ wait_event(t->writer_wait, atomic_read(&t->num_writers) == 0); } else { spin_unlock(&fs_info->trans_lock); } btrfs_cleanup_one_transaction(t); spin_lock(&fs_info->trans_lock); if (t == fs_info->running_transaction) fs_info->running_transaction = NULL; list_del_init(&t->list); spin_unlock(&fs_info->trans_lock); btrfs_put_transaction(t); trace_btrfs_transaction_commit(fs_info); spin_lock(&fs_info->trans_lock); } spin_unlock(&fs_info->trans_lock); btrfs_destroy_all_ordered_extents(fs_info); btrfs_destroy_delayed_inodes(fs_info); btrfs_assert_delayed_root_empty(fs_info); btrfs_destroy_all_delalloc_inodes(fs_info); btrfs_drop_all_logs(fs_info); btrfs_free_all_qgroup_pertrans(fs_info); mutex_unlock(&fs_info->transaction_kthread_mutex); return 0; } int btrfs_init_root_free_objectid(struct btrfs_root *root) { BTRFS_PATH_AUTO_FREE(path); int ret; struct extent_buffer *l; struct btrfs_key search_key; struct btrfs_key found_key; int slot; path = btrfs_alloc_path(); if (!path) return -ENOMEM; search_key.objectid = BTRFS_LAST_FREE_OBJECTID; search_key.type = -1; search_key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. */ return -EUCLEAN; } if (path->slots[0] > 0) { slot = path->slots[0] - 1; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); root->free_objectid = max_t(u64, found_key.objectid + 1, BTRFS_FIRST_FREE_OBJECTID); } else { root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; } return 0; } int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) { int ret; mutex_lock(&root->objectid_mutex); if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { btrfs_warn(root->fs_info, "the objectid of root %llu reaches its highest value", btrfs_root_id(root)); ret = -ENOSPC; goto out; } *objectid = root->free_objectid++; ret = 0; out: mutex_unlock(&root->objectid_mutex); return ret; } |
| 7 1 1 1 1 1 1 1 315 75 36 2 221 155 86 86 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 | // SPDX-License-Identifier: GPL-2.0 /* * SCSI functions used by both the initiator and the target code. */ #include <linux/bug.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/module.h> #include <uapi/linux/pr.h> #include <linux/unaligned.h> #include <scsi/scsi_common.h> MODULE_DESCRIPTION("SCSI functions used by both the initiator and the target code"); MODULE_LICENSE("GPL v2"); /* Command group 3 is reserved and should never be used. */ const unsigned char scsi_command_size_tbl[8] = { 6, 10, 10, 12, 16, 12, 10, 10 }; EXPORT_SYMBOL(scsi_command_size_tbl); /* NB: These are exposed through /proc/scsi/scsi and form part of the ABI. * You may not alter any existing entry (although adding new ones is * encouraged once assigned by ANSI/INCITS T10). */ static const char *const scsi_device_types[] = { "Direct-Access ", "Sequential-Access", "Printer ", "Processor ", "WORM ", "CD-ROM ", "Scanner ", "Optical Device ", "Medium Changer ", "Communications ", "ASC IT8 ", "ASC IT8 ", "RAID ", "Enclosure ", "Direct-Access-RBC", "Optical card ", "Bridge controller", "Object storage ", "Automation/Drive ", "Security Manager ", "Direct-Access-ZBC", }; /** * scsi_device_type - Return 17-char string indicating device type. * @type: type number to look up */ const char *scsi_device_type(unsigned type) { if (type == 0x1e) return "Well-known LUN "; if (type == 0x1f) return "No Device "; if (type >= ARRAY_SIZE(scsi_device_types)) return "Unknown "; return scsi_device_types[type]; } EXPORT_SYMBOL(scsi_device_type); enum pr_type scsi_pr_type_to_block(enum scsi_pr_type type) { switch (type) { case SCSI_PR_WRITE_EXCLUSIVE: return PR_WRITE_EXCLUSIVE; case SCSI_PR_EXCLUSIVE_ACCESS: return PR_EXCLUSIVE_ACCESS; case SCSI_PR_WRITE_EXCLUSIVE_REG_ONLY: return PR_WRITE_EXCLUSIVE_REG_ONLY; case SCSI_PR_EXCLUSIVE_ACCESS_REG_ONLY: return PR_EXCLUSIVE_ACCESS_REG_ONLY; case SCSI_PR_WRITE_EXCLUSIVE_ALL_REGS: return PR_WRITE_EXCLUSIVE_ALL_REGS; case SCSI_PR_EXCLUSIVE_ACCESS_ALL_REGS: return PR_EXCLUSIVE_ACCESS_ALL_REGS; } return 0; } EXPORT_SYMBOL_GPL(scsi_pr_type_to_block); enum scsi_pr_type block_pr_type_to_scsi(enum pr_type type) { switch (type) { case PR_WRITE_EXCLUSIVE: return SCSI_PR_WRITE_EXCLUSIVE; case PR_EXCLUSIVE_ACCESS: return SCSI_PR_EXCLUSIVE_ACCESS; case PR_WRITE_EXCLUSIVE_REG_ONLY: return SCSI_PR_WRITE_EXCLUSIVE_REG_ONLY; case PR_EXCLUSIVE_ACCESS_REG_ONLY: return SCSI_PR_EXCLUSIVE_ACCESS_REG_ONLY; case PR_WRITE_EXCLUSIVE_ALL_REGS: return SCSI_PR_WRITE_EXCLUSIVE_ALL_REGS; case PR_EXCLUSIVE_ACCESS_ALL_REGS: return SCSI_PR_EXCLUSIVE_ACCESS_ALL_REGS; } return 0; } EXPORT_SYMBOL_GPL(block_pr_type_to_scsi); /** * scsilun_to_int - convert a scsi_lun to an int * @scsilun: struct scsi_lun to be converted. * * Description: * Convert @scsilun from a struct scsi_lun to a four-byte host byte-ordered * integer, and return the result. The caller must check for * truncation before using this function. * * Notes: * For a description of the LUN format, post SCSI-3 see the SCSI * Architecture Model, for SCSI-3 see the SCSI Controller Commands. * * Given a struct scsi_lun of: d2 04 0b 03 00 00 00 00, this function * returns the integer: 0x0b03d204 * * This encoding will return a standard integer LUN for LUNs smaller * than 256, which typically use a single level LUN structure with * addressing method 0. */ u64 scsilun_to_int(struct scsi_lun *scsilun) { int i; u64 lun; lun = 0; for (i = 0; i < sizeof(lun); i += 2) lun = lun | (((u64)scsilun->scsi_lun[i] << ((i + 1) * 8)) | ((u64)scsilun->scsi_lun[i + 1] << (i * 8))); return lun; } EXPORT_SYMBOL(scsilun_to_int); /** * int_to_scsilun - reverts an int into a scsi_lun * @lun: integer to be reverted * @scsilun: struct scsi_lun to be set. * * Description: * Reverts the functionality of the scsilun_to_int, which packed * an 8-byte lun value into an int. This routine unpacks the int * back into the lun value. * * Notes: * Given an integer : 0x0b03d204, this function returns a * struct scsi_lun of: d2 04 0b 03 00 00 00 00 * */ void int_to_scsilun(u64 lun, struct scsi_lun *scsilun) { int i; memset(scsilun->scsi_lun, 0, sizeof(scsilun->scsi_lun)); for (i = 0; i < sizeof(lun); i += 2) { scsilun->scsi_lun[i] = (lun >> 8) & 0xFF; scsilun->scsi_lun[i+1] = lun & 0xFF; lun = lun >> 16; } } EXPORT_SYMBOL(int_to_scsilun); /** * scsi_normalize_sense - normalize main elements from either fixed or * descriptor sense data format into a common format. * * @sense_buffer: byte array containing sense data returned by device * @sb_len: number of valid bytes in sense_buffer * @sshdr: pointer to instance of structure that common * elements are written to. * * Notes: * The "main elements" from sense data are: response_code, sense_key, * asc, ascq and additional_length (only for descriptor format). * * Typically this function can be called after a device has * responded to a SCSI command with the CHECK_CONDITION status. * * Return value: * true if valid sense data information found, else false; */ bool scsi_normalize_sense(const u8 *sense_buffer, int sb_len, struct scsi_sense_hdr *sshdr) { memset(sshdr, 0, sizeof(struct scsi_sense_hdr)); if (!sense_buffer || !sb_len) return false; sshdr->response_code = (sense_buffer[0] & 0x7f); if (!scsi_sense_valid(sshdr)) return false; if (sshdr->response_code >= 0x72) { /* * descriptor format */ if (sb_len > 1) sshdr->sense_key = (sense_buffer[1] & 0xf); if (sb_len > 2) sshdr->asc = sense_buffer[2]; if (sb_len > 3) sshdr->ascq = sense_buffer[3]; if (sb_len > 7) sshdr->additional_length = sense_buffer[7]; } else { /* * fixed format */ if (sb_len > 2) sshdr->sense_key = (sense_buffer[2] & 0xf); if (sb_len > 7) { sb_len = min(sb_len, sense_buffer[7] + 8); if (sb_len > 12) sshdr->asc = sense_buffer[12]; if (sb_len > 13) sshdr->ascq = sense_buffer[13]; } } return true; } EXPORT_SYMBOL(scsi_normalize_sense); /** * scsi_sense_desc_find - search for a given descriptor type in descriptor sense data format. * @sense_buffer: byte array of descriptor format sense data * @sb_len: number of valid bytes in sense_buffer * @desc_type: value of descriptor type to find * (e.g. 0 -> information) * * Notes: * only valid when sense data is in descriptor format * * Return value: * pointer to start of (first) descriptor if found else NULL */ const u8 * scsi_sense_desc_find(const u8 * sense_buffer, int sb_len, int desc_type) { int add_sen_len, add_len, desc_len, k; const u8 * descp; if ((sb_len < 8) || (0 == (add_sen_len = sense_buffer[7]))) return NULL; if ((sense_buffer[0] < 0x72) || (sense_buffer[0] > 0x73)) return NULL; add_sen_len = (add_sen_len < (sb_len - 8)) ? add_sen_len : (sb_len - 8); descp = &sense_buffer[8]; for (desc_len = 0, k = 0; k < add_sen_len; k += desc_len) { descp += desc_len; add_len = (k < (add_sen_len - 1)) ? descp[1]: -1; desc_len = add_len + 2; if (descp[0] == desc_type) return descp; if (add_len < 0) // short descriptor ?? break; } return NULL; } EXPORT_SYMBOL(scsi_sense_desc_find); /** * scsi_build_sense_buffer - build sense data in a buffer * @desc: Sense format (non-zero == descriptor format, * 0 == fixed format) * @buf: Where to build sense data * @key: Sense key * @asc: Additional sense code * @ascq: Additional sense code qualifier * **/ void scsi_build_sense_buffer(int desc, u8 *buf, u8 key, u8 asc, u8 ascq) { if (desc) { buf[0] = 0x72; /* descriptor, current */ buf[1] = key; buf[2] = asc; buf[3] = ascq; buf[7] = 0; } else { buf[0] = 0x70; /* fixed, current */ buf[2] = key; buf[7] = 0xa; buf[12] = asc; buf[13] = ascq; } } EXPORT_SYMBOL(scsi_build_sense_buffer); /** * scsi_set_sense_information - set the information field in a * formatted sense data buffer * @buf: Where to build sense data * @buf_len: buffer length * @info: 64-bit information value to be set * * Return value: * 0 on success or -EINVAL for invalid sense buffer length **/ int scsi_set_sense_information(u8 *buf, int buf_len, u64 info) { if ((buf[0] & 0x7f) == 0x72) { u8 *ucp, len; len = buf[7]; ucp = (char *)scsi_sense_desc_find(buf, len + 8, 0); if (!ucp) { buf[7] = len + 0xc; ucp = buf + 8 + len; } if (buf_len < len + 0xc) /* Not enough room for info */ return -EINVAL; ucp[0] = 0; ucp[1] = 0xa; ucp[2] = 0x80; /* Valid bit */ ucp[3] = 0; put_unaligned_be64(info, &ucp[4]); } else if ((buf[0] & 0x7f) == 0x70) { /* * Only set the 'VALID' bit if we can represent the value * correctly; otherwise just fill out the lower bytes and * clear the 'VALID' flag. */ if (info <= 0xffffffffUL) buf[0] |= 0x80; else buf[0] &= 0x7f; put_unaligned_be32((u32)info, &buf[3]); } return 0; } EXPORT_SYMBOL(scsi_set_sense_information); /** * scsi_set_sense_field_pointer - set the field pointer sense key * specific information in a formatted sense data buffer * @buf: Where to build sense data * @buf_len: buffer length * @fp: field pointer to be set * @bp: bit pointer to be set * @cd: command/data bit * * Return value: * 0 on success or -EINVAL for invalid sense buffer length */ int scsi_set_sense_field_pointer(u8 *buf, int buf_len, u16 fp, u8 bp, bool cd) { u8 *ucp, len; if ((buf[0] & 0x7f) == 0x72) { len = buf[7]; ucp = (char *)scsi_sense_desc_find(buf, len + 8, 2); if (!ucp) { buf[7] = len + 8; ucp = buf + 8 + len; } if (buf_len < len + 8) /* Not enough room for info */ return -EINVAL; ucp[0] = 2; ucp[1] = 6; ucp[4] = 0x80; /* Valid bit */ if (cd) ucp[4] |= 0x40; if (bp < 0x8) ucp[4] |= 0x8 | bp; put_unaligned_be16(fp, &ucp[5]); } else if ((buf[0] & 0x7f) == 0x70) { len = buf[7]; if (len < 18) buf[7] = 18; buf[15] = 0x80; if (cd) buf[15] |= 0x40; if (bp < 0x8) buf[15] |= 0x8 | bp; put_unaligned_be16(fp, &buf[16]); } return 0; } EXPORT_SYMBOL(scsi_set_sense_field_pointer); |
| 9 7 1 1 1 1 1 1 1 1 1 1 1 3 3 22 1 15 15 1 1 1 1 1 1 7 7 7 2 5 7 64 5 2 13 1 1 1 1 2 1 4 1 5 6 1 1 1 1 1 1 1 2 1 3 2 1 1 2 4 4 1 1 1 1 1 3 3 3 1 2 2 3 1 2 3 2 2 2 2 1 1 1 2 1 1 2 1 1 4 2 1 1 14 1 1 1 1 1 5 5 5 5 5 1 1 1 4 4 4 1 4 1 1 8 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 | // SPDX-License-Identifier: GPL-2.0-only /* * vivid-ctrls.c - control support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/videodev2.h> #include <media/v4l2-event.h> #include <media/v4l2-common.h> #include "vivid-core.h" #include "vivid-vid-cap.h" #include "vivid-vid-out.h" #include "vivid-vid-common.h" #include "vivid-radio-common.h" #include "vivid-osd.h" #include "vivid-ctrls.h" #define VIVID_CID_CUSTOM_BASE (V4L2_CID_USER_BASE | 0xf000) #define VIVID_CID_BUTTON (VIVID_CID_CUSTOM_BASE + 0) #define VIVID_CID_BOOLEAN (VIVID_CID_CUSTOM_BASE + 1) #define VIVID_CID_INTEGER (VIVID_CID_CUSTOM_BASE + 2) #define VIVID_CID_INTEGER64 (VIVID_CID_CUSTOM_BASE + 3) #define VIVID_CID_MENU (VIVID_CID_CUSTOM_BASE + 4) #define VIVID_CID_STRING (VIVID_CID_CUSTOM_BASE + 5) #define VIVID_CID_BITMASK (VIVID_CID_CUSTOM_BASE + 6) #define VIVID_CID_INTMENU (VIVID_CID_CUSTOM_BASE + 7) #define VIVID_CID_U32_ARRAY (VIVID_CID_CUSTOM_BASE + 8) #define VIVID_CID_U16_MATRIX (VIVID_CID_CUSTOM_BASE + 9) #define VIVID_CID_U8_4D_ARRAY (VIVID_CID_CUSTOM_BASE + 10) #define VIVID_CID_AREA (VIVID_CID_CUSTOM_BASE + 11) #define VIVID_CID_RO_INTEGER (VIVID_CID_CUSTOM_BASE + 12) #define VIVID_CID_U32_DYN_ARRAY (VIVID_CID_CUSTOM_BASE + 13) #define VIVID_CID_U8_PIXEL_ARRAY (VIVID_CID_CUSTOM_BASE + 14) #define VIVID_CID_S32_ARRAY (VIVID_CID_CUSTOM_BASE + 15) #define VIVID_CID_S64_ARRAY (VIVID_CID_CUSTOM_BASE + 16) #define VIVID_CID_RECT (VIVID_CID_CUSTOM_BASE + 17) #define VIVID_CID_VIVID_BASE (0x00f00000 | 0xf000) #define VIVID_CID_VIVID_CLASS (0x00f00000 | 1) #define VIVID_CID_TEST_PATTERN (VIVID_CID_VIVID_BASE + 0) #define VIVID_CID_OSD_TEXT_MODE (VIVID_CID_VIVID_BASE + 1) #define VIVID_CID_HOR_MOVEMENT (VIVID_CID_VIVID_BASE + 2) #define VIVID_CID_VERT_MOVEMENT (VIVID_CID_VIVID_BASE + 3) #define VIVID_CID_SHOW_BORDER (VIVID_CID_VIVID_BASE + 4) #define VIVID_CID_SHOW_SQUARE (VIVID_CID_VIVID_BASE + 5) #define VIVID_CID_INSERT_SAV (VIVID_CID_VIVID_BASE + 6) #define VIVID_CID_INSERT_EAV (VIVID_CID_VIVID_BASE + 7) #define VIVID_CID_VBI_CAP_INTERLACED (VIVID_CID_VIVID_BASE + 8) #define VIVID_CID_INSERT_HDMI_VIDEO_GUARD_BAND (VIVID_CID_VIVID_BASE + 9) #define VIVID_CID_HFLIP (VIVID_CID_VIVID_BASE + 20) #define VIVID_CID_VFLIP (VIVID_CID_VIVID_BASE + 21) #define VIVID_CID_STD_ASPECT_RATIO (VIVID_CID_VIVID_BASE + 22) #define VIVID_CID_DV_TIMINGS_ASPECT_RATIO (VIVID_CID_VIVID_BASE + 23) #define VIVID_CID_TSTAMP_SRC (VIVID_CID_VIVID_BASE + 24) #define VIVID_CID_COLORSPACE (VIVID_CID_VIVID_BASE + 25) #define VIVID_CID_XFER_FUNC (VIVID_CID_VIVID_BASE + 26) #define VIVID_CID_YCBCR_ENC (VIVID_CID_VIVID_BASE + 27) #define VIVID_CID_QUANTIZATION (VIVID_CID_VIVID_BASE + 28) #define VIVID_CID_LIMITED_RGB_RANGE (VIVID_CID_VIVID_BASE + 29) #define VIVID_CID_ALPHA_MODE (VIVID_CID_VIVID_BASE + 30) #define VIVID_CID_HAS_CROP_CAP (VIVID_CID_VIVID_BASE + 31) #define VIVID_CID_HAS_COMPOSE_CAP (VIVID_CID_VIVID_BASE + 32) #define VIVID_CID_HAS_SCALER_CAP (VIVID_CID_VIVID_BASE + 33) #define VIVID_CID_HAS_CROP_OUT (VIVID_CID_VIVID_BASE + 34) #define VIVID_CID_HAS_COMPOSE_OUT (VIVID_CID_VIVID_BASE + 35) #define VIVID_CID_HAS_SCALER_OUT (VIVID_CID_VIVID_BASE + 36) #define VIVID_CID_SEQ_WRAP (VIVID_CID_VIVID_BASE + 38) #define VIVID_CID_TIME_WRAP (VIVID_CID_VIVID_BASE + 39) #define VIVID_CID_MAX_EDID_BLOCKS (VIVID_CID_VIVID_BASE + 40) #define VIVID_CID_PERCENTAGE_FILL (VIVID_CID_VIVID_BASE + 41) #define VIVID_CID_REDUCED_FPS (VIVID_CID_VIVID_BASE + 42) #define VIVID_CID_HSV_ENC (VIVID_CID_VIVID_BASE + 43) #define VIVID_CID_STD_SIGNAL_MODE (VIVID_CID_VIVID_BASE + 60) #define VIVID_CID_STANDARD (VIVID_CID_VIVID_BASE + 61) #define VIVID_CID_DV_TIMINGS_SIGNAL_MODE (VIVID_CID_VIVID_BASE + 62) #define VIVID_CID_DV_TIMINGS (VIVID_CID_VIVID_BASE + 63) #define VIVID_CID_PERC_DROPPED (VIVID_CID_VIVID_BASE + 64) #define VIVID_CID_DISCONNECT (VIVID_CID_VIVID_BASE + 65) #define VIVID_CID_DQBUF_ERROR (VIVID_CID_VIVID_BASE + 66) #define VIVID_CID_QUEUE_SETUP_ERROR (VIVID_CID_VIVID_BASE + 67) #define VIVID_CID_BUF_PREPARE_ERROR (VIVID_CID_VIVID_BASE + 68) #define VIVID_CID_START_STR_ERROR (VIVID_CID_VIVID_BASE + 69) #define VIVID_CID_QUEUE_ERROR (VIVID_CID_VIVID_BASE + 70) #define VIVID_CID_CLEAR_FB (VIVID_CID_VIVID_BASE + 71) #define VIVID_CID_REQ_VALIDATE_ERROR (VIVID_CID_VIVID_BASE + 72) #define VIVID_CID_RADIO_SEEK_MODE (VIVID_CID_VIVID_BASE + 90) #define VIVID_CID_RADIO_SEEK_PROG_LIM (VIVID_CID_VIVID_BASE + 91) #define VIVID_CID_RADIO_RX_RDS_RBDS (VIVID_CID_VIVID_BASE + 92) #define VIVID_CID_RADIO_RX_RDS_BLOCKIO (VIVID_CID_VIVID_BASE + 93) #define VIVID_CID_RADIO_TX_RDS_BLOCKIO (VIVID_CID_VIVID_BASE + 94) #define VIVID_CID_SDR_CAP_FM_DEVIATION (VIVID_CID_VIVID_BASE + 110) #define VIVID_CID_META_CAP_GENERATE_PTS (VIVID_CID_VIVID_BASE + 111) #define VIVID_CID_META_CAP_GENERATE_SCR (VIVID_CID_VIVID_BASE + 112) /* HDMI inputs are in the range 0-14. The next available CID is VIVID_CID_VIVID_BASE + 128 */ #define VIVID_CID_HDMI_IS_CONNECTED_TO_OUTPUT(input) (VIVID_CID_VIVID_BASE + 113 + (input)) /* S-Video inputs are in the range 0-15. The next available CID is VIVID_CID_VIVID_BASE + 144 */ #define VIVID_CID_SVID_IS_CONNECTED_TO_OUTPUT(input) (VIVID_CID_VIVID_BASE + 128 + (input)) /* General User Controls */ static void vivid_unregister_dev(bool valid, struct video_device *vdev) { if (!valid) return; clear_bit(V4L2_FL_REGISTERED, &vdev->flags); v4l2_event_wake_all(vdev); } static int vivid_user_gen_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_user_gen); switch (ctrl->id) { case VIVID_CID_DISCONNECT: v4l2_info(&dev->v4l2_dev, "disconnect\n"); dev->disconnect_error = true; vivid_unregister_dev(dev->has_vid_cap, &dev->vid_cap_dev); vivid_unregister_dev(dev->has_vid_out, &dev->vid_out_dev); vivid_unregister_dev(dev->has_vbi_cap, &dev->vbi_cap_dev); vivid_unregister_dev(dev->has_vbi_out, &dev->vbi_out_dev); vivid_unregister_dev(dev->has_radio_rx, &dev->radio_rx_dev); vivid_unregister_dev(dev->has_radio_tx, &dev->radio_tx_dev); vivid_unregister_dev(dev->has_sdr_cap, &dev->sdr_cap_dev); vivid_unregister_dev(dev->has_meta_cap, &dev->meta_cap_dev); vivid_unregister_dev(dev->has_meta_out, &dev->meta_out_dev); vivid_unregister_dev(dev->has_touch_cap, &dev->touch_cap_dev); break; case VIVID_CID_BUTTON: dev->button_pressed = 30; break; } return 0; } static const struct v4l2_ctrl_ops vivid_user_gen_ctrl_ops = { .s_ctrl = vivid_user_gen_s_ctrl, }; static const struct v4l2_ctrl_config vivid_ctrl_button = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_BUTTON, .name = "Button", .type = V4L2_CTRL_TYPE_BUTTON, }; static const struct v4l2_ctrl_config vivid_ctrl_boolean = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_BOOLEAN, .name = "Boolean", .type = V4L2_CTRL_TYPE_BOOLEAN, .min = 0, .max = 1, .step = 1, .def = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_int32 = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_INTEGER, .name = "Integer 32 Bits", .type = V4L2_CTRL_TYPE_INTEGER, .min = 0xffffffff80000000ULL, .max = 0x7fffffff, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_int64 = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_INTEGER64, .name = "Integer 64 Bits", .type = V4L2_CTRL_TYPE_INTEGER64, .min = 0x8000000000000000ULL, .max = 0x7fffffffffffffffLL, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_u32_array = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_U32_ARRAY, .name = "U32 1 Element Array", .type = V4L2_CTRL_TYPE_U32, .def = 0x18, .min = 0x10, .max = 0x20000, .step = 1, .dims = { 1 }, }; static const struct v4l2_ctrl_config vivid_ctrl_u32_dyn_array = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_U32_DYN_ARRAY, .name = "U32 Dynamic Array", .type = V4L2_CTRL_TYPE_U32, .flags = V4L2_CTRL_FLAG_DYNAMIC_ARRAY, .def = 50, .min = 10, .max = 90, .step = 1, .dims = { 100 }, }; static const struct v4l2_ctrl_config vivid_ctrl_u16_matrix = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_U16_MATRIX, .name = "U16 8x16 Matrix", .type = V4L2_CTRL_TYPE_U16, .def = 0x18, .min = 0x10, .max = 0x2000, .step = 1, .dims = { 8, 16 }, }; static const struct v4l2_ctrl_config vivid_ctrl_u8_4d_array = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_U8_4D_ARRAY, .name = "U8 2x3x4x5 Array", .type = V4L2_CTRL_TYPE_U8, .def = 0x18, .min = 0x10, .max = 0x20, .step = 1, .dims = { 2, 3, 4, 5 }, }; static const struct v4l2_ctrl_config vivid_ctrl_u8_pixel_array = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_U8_PIXEL_ARRAY, .name = "U8 Pixel Array", .type = V4L2_CTRL_TYPE_U8, .def = 0x80, .min = 0x00, .max = 0xff, .step = 1, .dims = { 640 / PIXEL_ARRAY_DIV, 360 / PIXEL_ARRAY_DIV }, }; static const struct v4l2_ctrl_config vivid_ctrl_s32_array = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_S32_ARRAY, .name = "S32 2 Element Array", .type = V4L2_CTRL_TYPE_INTEGER, .def = 2, .min = -10, .max = 10, .step = 1, .dims = { 2 }, }; static const struct v4l2_ctrl_config vivid_ctrl_s64_array = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_S64_ARRAY, .name = "S64 5 Element Array", .type = V4L2_CTRL_TYPE_INTEGER64, .def = 4, .min = -10, .max = 10, .step = 1, .dims = { 5 }, }; static const char * const vivid_ctrl_menu_strings[] = { "Menu Item 0 (Skipped)", "Menu Item 1", "Menu Item 2 (Skipped)", "Menu Item 3", "Menu Item 4", "Menu Item 5 (Skipped)", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_menu = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_MENU, .name = "Menu", .type = V4L2_CTRL_TYPE_MENU, .min = 1, .max = 4, .def = 3, .menu_skip_mask = 0x04, .qmenu = vivid_ctrl_menu_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_string = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_STRING, .name = "String", .type = V4L2_CTRL_TYPE_STRING, .min = 2, .max = 4, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_bitmask = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_BITMASK, .name = "Bitmask", .type = V4L2_CTRL_TYPE_BITMASK, .def = 0x80002000, .min = 0, .max = 0x80402010, .step = 0, }; static const s64 vivid_ctrl_int_menu_values[] = { 1, 1, 2, 3, 5, 8, 13, 21, 42, }; static const struct v4l2_ctrl_config vivid_ctrl_int_menu = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_INTMENU, .name = "Integer Menu", .type = V4L2_CTRL_TYPE_INTEGER_MENU, .min = 1, .max = 8, .def = 4, .menu_skip_mask = 0x02, .qmenu_int = vivid_ctrl_int_menu_values, }; static const struct v4l2_ctrl_config vivid_ctrl_disconnect = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_DISCONNECT, .name = "Disconnect", .type = V4L2_CTRL_TYPE_BUTTON, }; static const struct v4l2_area area = { .width = 1000, .height = 2000, }; static const struct v4l2_ctrl_config vivid_ctrl_area = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_AREA, .name = "Area", .type = V4L2_CTRL_TYPE_AREA, .p_def.p_const = &area, }; static const struct v4l2_ctrl_config vivid_ctrl_ro_int32 = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_RO_INTEGER, .name = "Read-Only Integer 32 Bits", .type = V4L2_CTRL_TYPE_INTEGER, .flags = V4L2_CTRL_FLAG_READ_ONLY, .min = 0, .max = 255, .step = 1, }; static const struct v4l2_rect rect_def = { .top = 100, .left = 200, .width = 300, .height = 400, }; static const struct v4l2_rect rect_min = { .top = 0, .left = 0, .width = 1, .height = 1, }; static const struct v4l2_rect rect_max = { .top = 0, .left = 0, .width = 1000, .height = 2000, }; static const struct v4l2_ctrl_config vivid_ctrl_rect = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_RECT, .name = "Rect", .type = V4L2_CTRL_TYPE_RECT, .flags = V4L2_CTRL_FLAG_HAS_WHICH_MIN_MAX, .p_def.p_const = &rect_def, .p_min.p_const = &rect_min, .p_max.p_const = &rect_max, }; /* Framebuffer Controls */ static int vivid_fb_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_fb); switch (ctrl->id) { case VIVID_CID_CLEAR_FB: vivid_fb_clear(dev); break; } return 0; } static const struct v4l2_ctrl_ops vivid_fb_ctrl_ops = { .s_ctrl = vivid_fb_s_ctrl, }; static const struct v4l2_ctrl_config vivid_ctrl_clear_fb = { .ops = &vivid_fb_ctrl_ops, .id = VIVID_CID_CLEAR_FB, .name = "Clear Framebuffer", .type = V4L2_CTRL_TYPE_BUTTON, }; /* Video User Controls */ static int vivid_user_vid_g_volatile_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_user_vid); switch (ctrl->id) { case V4L2_CID_AUTOGAIN: dev->gain->val = (jiffies_to_msecs(jiffies) / 1000) & 0xff; break; } return 0; } static int vivid_user_vid_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_user_vid); switch (ctrl->id) { case V4L2_CID_BRIGHTNESS: dev->input_brightness[dev->input] = ctrl->val - dev->input * 128; tpg_s_brightness(&dev->tpg, dev->input_brightness[dev->input]); break; case V4L2_CID_CONTRAST: tpg_s_contrast(&dev->tpg, ctrl->val); break; case V4L2_CID_SATURATION: tpg_s_saturation(&dev->tpg, ctrl->val); break; case V4L2_CID_HUE: tpg_s_hue(&dev->tpg, ctrl->val); break; case V4L2_CID_HFLIP: dev->hflip = ctrl->val; tpg_s_hflip(&dev->tpg, dev->sensor_hflip ^ dev->hflip); break; case V4L2_CID_VFLIP: dev->vflip = ctrl->val; tpg_s_vflip(&dev->tpg, dev->sensor_vflip ^ dev->vflip); break; case V4L2_CID_ALPHA_COMPONENT: tpg_s_alpha_component(&dev->tpg, ctrl->val); break; } return 0; } static const struct v4l2_ctrl_ops vivid_user_vid_ctrl_ops = { .g_volatile_ctrl = vivid_user_vid_g_volatile_ctrl, .s_ctrl = vivid_user_vid_s_ctrl, }; /* Video Capture Controls */ static void vivid_update_power_present(struct vivid_dev *dev) { unsigned int i, j; dev->power_present = 0; for (i = 0, j = 0; i < ARRAY_SIZE(dev->dv_timings_signal_mode); i++) { if (dev->input_type[i] != HDMI) continue; /* * If connected to TPG or HDMI output, and the signal * mode is not NO_SIGNAL, then there is power present. */ if (dev->input_is_connected_to_output[i] != 1 && dev->dv_timings_signal_mode[i] != NO_SIGNAL) dev->power_present |= (1 << j); j++; } __v4l2_ctrl_s_ctrl(dev->ctrl_rx_power_present, dev->power_present); v4l2_ctrl_activate(dev->ctrl_dv_timings, dev->dv_timings_signal_mode[dev->input] == SELECTED_DV_TIMINGS); } static int vivid_vid_cap_s_ctrl(struct v4l2_ctrl *ctrl) { static const u32 colorspaces[] = { V4L2_COLORSPACE_SMPTE170M, V4L2_COLORSPACE_REC709, V4L2_COLORSPACE_SRGB, V4L2_COLORSPACE_OPRGB, V4L2_COLORSPACE_BT2020, V4L2_COLORSPACE_DCI_P3, V4L2_COLORSPACE_SMPTE240M, V4L2_COLORSPACE_470_SYSTEM_M, V4L2_COLORSPACE_470_SYSTEM_BG, }; struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_vid_cap); unsigned int i; struct vivid_dev *output_inst = NULL; int index = 0; int hdmi_index, svid_index; s32 input_index = 0; switch (ctrl->id) { case VIVID_CID_TEST_PATTERN: vivid_update_quality(dev); tpg_s_pattern(&dev->tpg, ctrl->val); break; case VIVID_CID_COLORSPACE: tpg_s_colorspace(&dev->tpg, colorspaces[ctrl->val]); vivid_send_source_change(dev, TV); vivid_send_source_change(dev, SVID); vivid_send_source_change(dev, HDMI); vivid_send_source_change(dev, WEBCAM); break; case VIVID_CID_XFER_FUNC: tpg_s_xfer_func(&dev->tpg, ctrl->val); vivid_send_source_change(dev, TV); vivid_send_source_change(dev, SVID); vivid_send_source_change(dev, HDMI); vivid_send_source_change(dev, WEBCAM); break; case VIVID_CID_YCBCR_ENC: tpg_s_ycbcr_enc(&dev->tpg, ctrl->val); vivid_send_source_change(dev, TV); vivid_send_source_change(dev, SVID); vivid_send_source_change(dev, HDMI); vivid_send_source_change(dev, WEBCAM); break; case VIVID_CID_HSV_ENC: tpg_s_hsv_enc(&dev->tpg, ctrl->val ? V4L2_HSV_ENC_256 : V4L2_HSV_ENC_180); vivid_send_source_change(dev, TV); vivid_send_source_change(dev, SVID); vivid_send_source_change(dev, HDMI); vivid_send_source_change(dev, WEBCAM); break; case VIVID_CID_QUANTIZATION: tpg_s_quantization(&dev->tpg, ctrl->val); vivid_send_source_change(dev, TV); vivid_send_source_change(dev, SVID); vivid_send_source_change(dev, HDMI); vivid_send_source_change(dev, WEBCAM); break; case V4L2_CID_DV_RX_RGB_RANGE: if (!vivid_is_hdmi_cap(dev)) break; tpg_s_rgb_range(&dev->tpg, ctrl->val); break; case VIVID_CID_LIMITED_RGB_RANGE: tpg_s_real_rgb_range(&dev->tpg, ctrl->val ? V4L2_DV_RGB_RANGE_LIMITED : V4L2_DV_RGB_RANGE_FULL); break; case VIVID_CID_ALPHA_MODE: tpg_s_alpha_mode(&dev->tpg, ctrl->val); break; case VIVID_CID_HOR_MOVEMENT: tpg_s_mv_hor_mode(&dev->tpg, ctrl->val); break; case VIVID_CID_VERT_MOVEMENT: tpg_s_mv_vert_mode(&dev->tpg, ctrl->val); break; case VIVID_CID_OSD_TEXT_MODE: dev->osd_mode = ctrl->val; break; case VIVID_CID_PERCENTAGE_FILL: tpg_s_perc_fill(&dev->tpg, ctrl->val); for (i = 0; i < MAX_VID_CAP_BUFFERS; i++) dev->must_blank[i] = ctrl->val < 100; break; case VIVID_CID_INSERT_SAV: tpg_s_insert_sav(&dev->tpg, ctrl->val); break; case VIVID_CID_INSERT_EAV: tpg_s_insert_eav(&dev->tpg, ctrl->val); break; case VIVID_CID_INSERT_HDMI_VIDEO_GUARD_BAND: tpg_s_insert_hdmi_video_guard_band(&dev->tpg, ctrl->val); break; case VIVID_CID_HFLIP: dev->sensor_hflip = ctrl->val; tpg_s_hflip(&dev->tpg, dev->sensor_hflip ^ dev->hflip); break; case VIVID_CID_VFLIP: dev->sensor_vflip = ctrl->val; tpg_s_vflip(&dev->tpg, dev->sensor_vflip ^ dev->vflip); break; case VIVID_CID_REDUCED_FPS: dev->reduced_fps = ctrl->val; vivid_update_format_cap(dev, true); break; case VIVID_CID_HAS_CROP_CAP: dev->has_crop_cap = ctrl->val; vivid_update_format_cap(dev, true); break; case VIVID_CID_HAS_COMPOSE_CAP: dev->has_compose_cap = ctrl->val; vivid_update_format_cap(dev, true); break; case VIVID_CID_HAS_SCALER_CAP: dev->has_scaler_cap = ctrl->val; vivid_update_format_cap(dev, true); break; case VIVID_CID_SHOW_BORDER: tpg_s_show_border(&dev->tpg, ctrl->val); break; case VIVID_CID_SHOW_SQUARE: tpg_s_show_square(&dev->tpg, ctrl->val); break; case VIVID_CID_STD_ASPECT_RATIO: dev->std_aspect_ratio[dev->input] = ctrl->val; tpg_s_video_aspect(&dev->tpg, vivid_get_video_aspect(dev)); break; case VIVID_CID_DV_TIMINGS_SIGNAL_MODE: dev->dv_timings_signal_mode[dev->input] = dev->ctrl_dv_timings_signal_mode->val; dev->query_dv_timings[dev->input] = dev->ctrl_dv_timings->val; vivid_update_power_present(dev); vivid_update_quality(dev); vivid_send_input_source_change(dev, dev->input); break; case VIVID_CID_DV_TIMINGS_ASPECT_RATIO: dev->dv_timings_aspect_ratio[dev->input] = ctrl->val; tpg_s_video_aspect(&dev->tpg, vivid_get_video_aspect(dev)); break; case VIVID_CID_TSTAMP_SRC: dev->tstamp_src_is_soe = ctrl->val; dev->vb_vid_cap_q.timestamp_flags &= ~V4L2_BUF_FLAG_TSTAMP_SRC_MASK; if (dev->tstamp_src_is_soe) dev->vb_vid_cap_q.timestamp_flags |= V4L2_BUF_FLAG_TSTAMP_SRC_SOE; break; case VIVID_CID_MAX_EDID_BLOCKS: dev->edid_max_blocks = ctrl->val; if (dev->edid_blocks > dev->edid_max_blocks) dev->edid_blocks = dev->edid_max_blocks; break; case VIVID_CID_HDMI_IS_CONNECTED_TO_OUTPUT(0) ... VIVID_CID_HDMI_IS_CONNECTED_TO_OUTPUT(14): hdmi_index = ctrl->id - VIVID_CID_HDMI_IS_CONNECTED_TO_OUTPUT(0); output_inst = vivid_ctrl_hdmi_to_output_instance[ctrl->cur.val]; index = vivid_ctrl_hdmi_to_output_index[ctrl->cur.val]; input_index = dev->hdmi_index_to_input_index[hdmi_index]; dev->input_is_connected_to_output[input_index] = ctrl->val; if (output_inst) { output_inst->output_to_input_instance[index] = NULL; vivid_update_outputs(output_inst); cec_phys_addr_invalidate(output_inst->cec_tx_adap[index]); } if (ctrl->val >= FIXED_MENU_ITEMS) { output_inst = vivid_ctrl_hdmi_to_output_instance[ctrl->val]; index = vivid_ctrl_hdmi_to_output_index[ctrl->val]; output_inst->output_to_input_instance[index] = dev; output_inst->output_to_input_index[index] = dev->hdmi_index_to_input_index[hdmi_index]; } spin_lock(&hdmi_output_skip_mask_lock); hdmi_to_output_menu_skip_mask &= ~(1ULL << ctrl->cur.val); if (ctrl->val >= FIXED_MENU_ITEMS) hdmi_to_output_menu_skip_mask |= 1ULL << ctrl->val; spin_unlock(&hdmi_output_skip_mask_lock); vivid_update_power_present(dev); vivid_update_quality(dev); vivid_send_input_source_change(dev, dev->hdmi_index_to_input_index[hdmi_index]); if (ctrl->val < FIXED_MENU_ITEMS && ctrl->cur.val < FIXED_MENU_ITEMS) break; spin_lock(&hdmi_output_skip_mask_lock); hdmi_input_update_outputs_mask |= 1 << dev->inst; spin_unlock(&hdmi_output_skip_mask_lock); queue_work(update_hdmi_ctrls_workqueue, &dev->update_hdmi_ctrl_work); break; case VIVID_CID_SVID_IS_CONNECTED_TO_OUTPUT(0) ... VIVID_CID_SVID_IS_CONNECTED_TO_OUTPUT(15): svid_index = ctrl->id - VIVID_CID_SVID_IS_CONNECTED_TO_OUTPUT(0); output_inst = vivid_ctrl_svid_to_output_instance[ctrl->cur.val]; index = vivid_ctrl_svid_to_output_index[ctrl->cur.val]; input_index = dev->svid_index_to_input_index[svid_index]; dev->input_is_connected_to_output[input_index] = ctrl->val; if (output_inst) output_inst->output_to_input_instance[index] = NULL; if (ctrl->val >= FIXED_MENU_ITEMS) { output_inst = vivid_ctrl_svid_to_output_instance[ctrl->val]; index = vivid_ctrl_svid_to_output_index[ctrl->val]; output_inst->output_to_input_instance[index] = dev; output_inst->output_to_input_index[index] = dev->svid_index_to_input_index[svid_index]; } spin_lock(&svid_output_skip_mask_lock); svid_to_output_menu_skip_mask &= ~(1ULL << ctrl->cur.val); if (ctrl->val >= FIXED_MENU_ITEMS) svid_to_output_menu_skip_mask |= 1ULL << ctrl->val; spin_unlock(&svid_output_skip_mask_lock); vivid_update_quality(dev); vivid_send_input_source_change(dev, dev->svid_index_to_input_index[svid_index]); if (ctrl->val < FIXED_MENU_ITEMS && ctrl->cur.val < FIXED_MENU_ITEMS) break; queue_work(update_svid_ctrls_workqueue, &dev->update_svid_ctrl_work); break; } return 0; } static const struct v4l2_ctrl_ops vivid_vid_cap_ctrl_ops = { .s_ctrl = vivid_vid_cap_s_ctrl, }; static const char * const vivid_ctrl_hor_movement_strings[] = { "Move Left Fast", "Move Left", "Move Left Slow", "No Movement", "Move Right Slow", "Move Right", "Move Right Fast", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_hor_movement = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_HOR_MOVEMENT, .name = "Horizontal Movement", .type = V4L2_CTRL_TYPE_MENU, .max = TPG_MOVE_POS_FAST, .def = TPG_MOVE_NONE, .qmenu = vivid_ctrl_hor_movement_strings, }; static const char * const vivid_ctrl_vert_movement_strings[] = { "Move Up Fast", "Move Up", "Move Up Slow", "No Movement", "Move Down Slow", "Move Down", "Move Down Fast", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_vert_movement = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_VERT_MOVEMENT, .name = "Vertical Movement", .type = V4L2_CTRL_TYPE_MENU, .max = TPG_MOVE_POS_FAST, .def = TPG_MOVE_NONE, .qmenu = vivid_ctrl_vert_movement_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_show_border = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_SHOW_BORDER, .name = "Show Border", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_show_square = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_SHOW_SQUARE, .name = "Show Square", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const char * const vivid_ctrl_osd_mode_strings[] = { "All", "Counters Only", "None", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_osd_mode = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_OSD_TEXT_MODE, .name = "OSD Text Mode", .type = V4L2_CTRL_TYPE_MENU, .max = ARRAY_SIZE(vivid_ctrl_osd_mode_strings) - 2, .qmenu = vivid_ctrl_osd_mode_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_perc_fill = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_PERCENTAGE_FILL, .name = "Fill Percentage of Frame", .type = V4L2_CTRL_TYPE_INTEGER, .min = 0, .max = 100, .def = 100, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_insert_sav = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_INSERT_SAV, .name = "Insert SAV Code in Image", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_insert_eav = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_INSERT_EAV, .name = "Insert EAV Code in Image", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_insert_hdmi_video_guard_band = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_INSERT_HDMI_VIDEO_GUARD_BAND, .name = "Insert Video Guard Band", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_hflip = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_HFLIP, .name = "Sensor Flipped Horizontally", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_vflip = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_VFLIP, .name = "Sensor Flipped Vertically", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_reduced_fps = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_REDUCED_FPS, .name = "Reduced Framerate", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_has_crop_cap = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_HAS_CROP_CAP, .name = "Enable Capture Cropping", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .def = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_has_compose_cap = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_HAS_COMPOSE_CAP, .name = "Enable Capture Composing", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .def = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_has_scaler_cap = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_HAS_SCALER_CAP, .name = "Enable Capture Scaler", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .def = 1, .step = 1, }; static const char * const vivid_ctrl_tstamp_src_strings[] = { "End of Frame", "Start of Exposure", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_tstamp_src = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_TSTAMP_SRC, .name = "Timestamp Source", .type = V4L2_CTRL_TYPE_MENU, .max = ARRAY_SIZE(vivid_ctrl_tstamp_src_strings) - 2, .qmenu = vivid_ctrl_tstamp_src_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_std_aspect_ratio = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_STD_ASPECT_RATIO, .name = "Standard Aspect Ratio", .type = V4L2_CTRL_TYPE_MENU, .min = 1, .max = 4, .def = 1, .qmenu = tpg_aspect_strings, }; static const char * const vivid_ctrl_dv_timings_signal_mode_strings[] = { "Current DV Timings", "No Signal", "No Lock", "Out of Range", "Selected DV Timings", "Cycle Through All DV Timings", "Custom DV Timings", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_dv_timings_signal_mode = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_DV_TIMINGS_SIGNAL_MODE, .name = "DV Timings Signal Mode", .type = V4L2_CTRL_TYPE_MENU, .max = 5, .qmenu = vivid_ctrl_dv_timings_signal_mode_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_dv_timings_aspect_ratio = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_DV_TIMINGS_ASPECT_RATIO, .name = "DV Timings Aspect Ratio", .type = V4L2_CTRL_TYPE_MENU, .max = 3, .qmenu = tpg_aspect_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_max_edid_blocks = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_MAX_EDID_BLOCKS, .name = "Maximum EDID Blocks", .type = V4L2_CTRL_TYPE_INTEGER, .min = 1, .max = 256, .def = 2, .step = 1, }; static const char * const vivid_ctrl_colorspace_strings[] = { "SMPTE 170M", "Rec. 709", "sRGB", "opRGB", "BT.2020", "DCI-P3", "SMPTE 240M", "470 System M", "470 System BG", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_colorspace = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_COLORSPACE, .name = "Colorspace", .type = V4L2_CTRL_TYPE_MENU, .max = ARRAY_SIZE(vivid_ctrl_colorspace_strings) - 2, .def = 2, .qmenu = vivid_ctrl_colorspace_strings, }; static const char * const vivid_ctrl_xfer_func_strings[] = { "Default", "Rec. 709", "sRGB", "opRGB", "SMPTE 240M", "None", "DCI-P3", "SMPTE 2084", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_xfer_func = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_XFER_FUNC, .name = "Transfer Function", .type = V4L2_CTRL_TYPE_MENU, .max = ARRAY_SIZE(vivid_ctrl_xfer_func_strings) - 2, .qmenu = vivid_ctrl_xfer_func_strings, }; static const char * const vivid_ctrl_ycbcr_enc_strings[] = { "Default", "ITU-R 601", "Rec. 709", "xvYCC 601", "xvYCC 709", "", "BT.2020", "BT.2020 Constant Luminance", "SMPTE 240M", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_ycbcr_enc = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_YCBCR_ENC, .name = "Y'CbCr Encoding", .type = V4L2_CTRL_TYPE_MENU, .menu_skip_mask = 1 << 5, .max = ARRAY_SIZE(vivid_ctrl_ycbcr_enc_strings) - 2, .qmenu = vivid_ctrl_ycbcr_enc_strings, }; static const char * const vivid_ctrl_hsv_enc_strings[] = { "Hue 0-179", "Hue 0-256", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_hsv_enc = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_HSV_ENC, .name = "HSV Encoding", .type = V4L2_CTRL_TYPE_MENU, .max = ARRAY_SIZE(vivid_ctrl_hsv_enc_strings) - 2, .qmenu = vivid_ctrl_hsv_enc_strings, }; static const char * const vivid_ctrl_quantization_strings[] = { "Default", "Full Range", "Limited Range", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_quantization = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_QUANTIZATION, .name = "Quantization", .type = V4L2_CTRL_TYPE_MENU, .max = ARRAY_SIZE(vivid_ctrl_quantization_strings) - 2, .qmenu = vivid_ctrl_quantization_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_alpha_mode = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_ALPHA_MODE, .name = "Apply Alpha To Red Only", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_limited_rgb_range = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_LIMITED_RGB_RANGE, .name = "Limited RGB Range (16-235)", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; /* VBI Capture Control */ static int vivid_vbi_cap_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_vbi_cap); switch (ctrl->id) { case VIVID_CID_VBI_CAP_INTERLACED: dev->vbi_cap_interlaced = ctrl->val; break; } return 0; } static const struct v4l2_ctrl_ops vivid_vbi_cap_ctrl_ops = { .s_ctrl = vivid_vbi_cap_s_ctrl, }; static const struct v4l2_ctrl_config vivid_ctrl_vbi_cap_interlaced = { .ops = &vivid_vbi_cap_ctrl_ops, .id = VIVID_CID_VBI_CAP_INTERLACED, .name = "Interlaced VBI Format", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; /* Video Output Controls */ static int vivid_vid_out_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_vid_out); struct v4l2_bt_timings *bt = &dev->dv_timings_out.bt; switch (ctrl->id) { case VIVID_CID_HAS_CROP_OUT: dev->has_crop_out = ctrl->val; vivid_update_format_out(dev); break; case VIVID_CID_HAS_COMPOSE_OUT: dev->has_compose_out = ctrl->val; vivid_update_format_out(dev); break; case VIVID_CID_HAS_SCALER_OUT: dev->has_scaler_out = ctrl->val; vivid_update_format_out(dev); break; case V4L2_CID_DV_TX_MODE: dev->dvi_d_out = ctrl->val == V4L2_DV_TX_MODE_DVI_D; if (!vivid_is_hdmi_out(dev)) break; if (!dev->dvi_d_out && (bt->flags & V4L2_DV_FL_IS_CE_VIDEO)) { if (bt->width == 720 && bt->height <= 576) dev->colorspace_out = V4L2_COLORSPACE_SMPTE170M; else dev->colorspace_out = V4L2_COLORSPACE_REC709; dev->quantization_out = V4L2_QUANTIZATION_DEFAULT; } else { dev->colorspace_out = V4L2_COLORSPACE_SRGB; dev->quantization_out = dev->dvi_d_out ? V4L2_QUANTIZATION_LIM_RANGE : V4L2_QUANTIZATION_DEFAULT; } if (vivid_output_is_connected_to(dev)) { struct vivid_dev *dev_rx = vivid_output_is_connected_to(dev); vivid_send_source_change(dev_rx, HDMI); } break; } return 0; } static const struct v4l2_ctrl_ops vivid_vid_out_ctrl_ops = { .s_ctrl = vivid_vid_out_s_ctrl, }; static const struct v4l2_ctrl_config vivid_ctrl_has_crop_out = { .ops = &vivid_vid_out_ctrl_ops, .id = VIVID_CID_HAS_CROP_OUT, .name = "Enable Output Cropping", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .def = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_has_compose_out = { .ops = &vivid_vid_out_ctrl_ops, .id = VIVID_CID_HAS_COMPOSE_OUT, .name = "Enable Output Composing", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .def = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_has_scaler_out = { .ops = &vivid_vid_out_ctrl_ops, .id = VIVID_CID_HAS_SCALER_OUT, .name = "Enable Output Scaler", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .def = 1, .step = 1, }; /* Streaming Controls */ static int vivid_streaming_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_streaming); switch (ctrl->id) { case VIVID_CID_DQBUF_ERROR: dev->dqbuf_error = true; break; case VIVID_CID_PERC_DROPPED: dev->perc_dropped_buffers = ctrl->val; break; case VIVID_CID_QUEUE_SETUP_ERROR: dev->queue_setup_error = true; break; case VIVID_CID_BUF_PREPARE_ERROR: dev->buf_prepare_error = true; break; case VIVID_CID_START_STR_ERROR: dev->start_streaming_error = true; break; case VIVID_CID_REQ_VALIDATE_ERROR: dev->req_validate_error = true; break; case VIVID_CID_QUEUE_ERROR: if (vb2_start_streaming_called(&dev->vb_vid_cap_q)) vb2_queue_error(&dev->vb_vid_cap_q); if (vb2_start_streaming_called(&dev->vb_vbi_cap_q)) vb2_queue_error(&dev->vb_vbi_cap_q); if (vb2_start_streaming_called(&dev->vb_vid_out_q)) vb2_queue_error(&dev->vb_vid_out_q); if (vb2_start_streaming_called(&dev->vb_vbi_out_q)) vb2_queue_error(&dev->vb_vbi_out_q); if (vb2_start_streaming_called(&dev->vb_sdr_cap_q)) vb2_queue_error(&dev->vb_sdr_cap_q); break; case VIVID_CID_SEQ_WRAP: dev->seq_wrap = ctrl->val; break; case VIVID_CID_TIME_WRAP: dev->time_wrap = ctrl->val; if (dev->time_wrap == 1) dev->time_wrap = (1ULL << 63) - NSEC_PER_SEC * 16ULL; else if (dev->time_wrap == 2) dev->time_wrap = ((1ULL << 31) - 16) * NSEC_PER_SEC; break; } return 0; } static const struct v4l2_ctrl_ops vivid_streaming_ctrl_ops = { .s_ctrl = vivid_streaming_s_ctrl, }; static const struct v4l2_ctrl_config vivid_ctrl_dqbuf_error = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_DQBUF_ERROR, .name = "Inject V4L2_BUF_FLAG_ERROR", .type = V4L2_CTRL_TYPE_BUTTON, }; static const struct v4l2_ctrl_config vivid_ctrl_perc_dropped = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_PERC_DROPPED, .name = "Percentage of Dropped Buffers", .type = V4L2_CTRL_TYPE_INTEGER, .min = 0, .max = 100, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_queue_setup_error = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_QUEUE_SETUP_ERROR, .name = "Inject VIDIOC_REQBUFS Error", .type = V4L2_CTRL_TYPE_BUTTON, }; static const struct v4l2_ctrl_config vivid_ctrl_buf_prepare_error = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_BUF_PREPARE_ERROR, .name = "Inject VIDIOC_QBUF Error", .type = V4L2_CTRL_TYPE_BUTTON, }; static const struct v4l2_ctrl_config vivid_ctrl_start_streaming_error = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_START_STR_ERROR, .name = "Inject VIDIOC_STREAMON Error", .type = V4L2_CTRL_TYPE_BUTTON, }; static const struct v4l2_ctrl_config vivid_ctrl_queue_error = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_QUEUE_ERROR, .name = "Inject Fatal Streaming Error", .type = V4L2_CTRL_TYPE_BUTTON, }; #ifdef CONFIG_MEDIA_CONTROLLER static const struct v4l2_ctrl_config vivid_ctrl_req_validate_error = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_REQ_VALIDATE_ERROR, .name = "Inject req_validate() Error", .type = V4L2_CTRL_TYPE_BUTTON, }; #endif static const struct v4l2_ctrl_config vivid_ctrl_seq_wrap = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_SEQ_WRAP, .name = "Wrap Sequence Number", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const char * const vivid_ctrl_time_wrap_strings[] = { "None", "64 Bit", "32 Bit", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_time_wrap = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_TIME_WRAP, .name = "Wrap Timestamp", .type = V4L2_CTRL_TYPE_MENU, .max = ARRAY_SIZE(vivid_ctrl_time_wrap_strings) - 2, .qmenu = vivid_ctrl_time_wrap_strings, }; /* SDTV Capture Controls */ static int vivid_sdtv_cap_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_sdtv_cap); switch (ctrl->id) { case VIVID_CID_STD_SIGNAL_MODE: dev->std_signal_mode[dev->input] = dev->ctrl_std_signal_mode->val; if (dev->std_signal_mode[dev->input] == SELECTED_STD) dev->query_std[dev->input] = vivid_standard[dev->ctrl_standard->val]; v4l2_ctrl_activate(dev->ctrl_standard, dev->std_signal_mode[dev->input] == SELECTED_STD); vivid_update_quality(dev); vivid_send_source_change(dev, TV); vivid_send_source_change(dev, SVID); break; } return 0; } static const struct v4l2_ctrl_ops vivid_sdtv_cap_ctrl_ops = { .s_ctrl = vivid_sdtv_cap_s_ctrl, }; static const char * const vivid_ctrl_std_signal_mode_strings[] = { "Current Standard", "No Signal", "No Lock", "", "Selected Standard", "Cycle Through All Standards", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_std_signal_mode = { .ops = &vivid_sdtv_cap_ctrl_ops, .id = VIVID_CID_STD_SIGNAL_MODE, .name = "Standard Signal Mode", .type = V4L2_CTRL_TYPE_MENU, .max = ARRAY_SIZE(vivid_ctrl_std_signal_mode_strings) - 2, .menu_skip_mask = 1 << 3, .qmenu = vivid_ctrl_std_signal_mode_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_standard = { .ops = &vivid_sdtv_cap_ctrl_ops, .id = VIVID_CID_STANDARD, .name = "Standard", .type = V4L2_CTRL_TYPE_MENU, .max = 14, .qmenu = vivid_ctrl_standard_strings, }; /* Radio Receiver Controls */ static int vivid_radio_rx_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_radio_rx); switch (ctrl->id) { case VIVID_CID_RADIO_SEEK_MODE: dev->radio_rx_hw_seek_mode = ctrl->val; break; case VIVID_CID_RADIO_SEEK_PROG_LIM: dev->radio_rx_hw_seek_prog_lim = ctrl->val; break; case VIVID_CID_RADIO_RX_RDS_RBDS: dev->rds_gen.use_rbds = ctrl->val; break; case VIVID_CID_RADIO_RX_RDS_BLOCKIO: dev->radio_rx_rds_controls = ctrl->val; dev->radio_rx_caps &= ~V4L2_CAP_READWRITE; dev->radio_rx_rds_use_alternates = false; if (!dev->radio_rx_rds_controls) { dev->radio_rx_caps |= V4L2_CAP_READWRITE; __v4l2_ctrl_s_ctrl(dev->radio_rx_rds_pty, 0); __v4l2_ctrl_s_ctrl(dev->radio_rx_rds_ta, 0); __v4l2_ctrl_s_ctrl(dev->radio_rx_rds_tp, 0); __v4l2_ctrl_s_ctrl(dev->radio_rx_rds_ms, 0); __v4l2_ctrl_s_ctrl_string(dev->radio_rx_rds_psname, ""); __v4l2_ctrl_s_ctrl_string(dev->radio_rx_rds_radiotext, ""); } v4l2_ctrl_activate(dev->radio_rx_rds_pty, dev->radio_rx_rds_controls); v4l2_ctrl_activate(dev->radio_rx_rds_psname, dev->radio_rx_rds_controls); v4l2_ctrl_activate(dev->radio_rx_rds_radiotext, dev->radio_rx_rds_controls); v4l2_ctrl_activate(dev->radio_rx_rds_ta, dev->radio_rx_rds_controls); v4l2_ctrl_activate(dev->radio_rx_rds_tp, dev->radio_rx_rds_controls); v4l2_ctrl_activate(dev->radio_rx_rds_ms, dev->radio_rx_rds_controls); dev->radio_rx_dev.device_caps = dev->radio_rx_caps; break; case V4L2_CID_RDS_RECEPTION: dev->radio_rx_rds_enabled = ctrl->val; break; } return 0; } static const struct v4l2_ctrl_ops vivid_radio_rx_ctrl_ops = { .s_ctrl = vivid_radio_rx_s_ctrl, }; static const char * const vivid_ctrl_radio_rds_mode_strings[] = { "Block I/O", "Controls", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_radio_rx_rds_blockio = { .ops = &vivid_radio_rx_ctrl_ops, .id = VIVID_CID_RADIO_RX_RDS_BLOCKIO, .name = "RDS Rx I/O Mode", .type = V4L2_CTRL_TYPE_MENU, .qmenu = vivid_ctrl_radio_rds_mode_strings, .max = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_radio_rx_rds_rbds = { .ops = &vivid_radio_rx_ctrl_ops, .id = VIVID_CID_RADIO_RX_RDS_RBDS, .name = "Generate RBDS Instead of RDS", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const char * const vivid_ctrl_radio_hw_seek_mode_strings[] = { "Bounded", "Wrap Around", "Both", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_radio_hw_seek_mode = { .ops = &vivid_radio_rx_ctrl_ops, .id = VIVID_CID_RADIO_SEEK_MODE, .name = "Radio HW Seek Mode", .type = V4L2_CTRL_TYPE_MENU, .max = 2, .qmenu = vivid_ctrl_radio_hw_seek_mode_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_radio_hw_seek_prog_lim = { .ops = &vivid_radio_rx_ctrl_ops, .id = VIVID_CID_RADIO_SEEK_PROG_LIM, .name = "Radio Programmable HW Seek", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; /* Radio Transmitter Controls */ static int vivid_radio_tx_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_radio_tx); switch (ctrl->id) { case VIVID_CID_RADIO_TX_RDS_BLOCKIO: dev->radio_tx_rds_controls = ctrl->val; dev->radio_tx_caps &= ~V4L2_CAP_READWRITE; if (!dev->radio_tx_rds_controls) dev->radio_tx_caps |= V4L2_CAP_READWRITE; dev->radio_tx_dev.device_caps = dev->radio_tx_caps; break; case V4L2_CID_RDS_TX_PTY: if (dev->radio_rx_rds_controls) v4l2_ctrl_s_ctrl(dev->radio_rx_rds_pty, ctrl->val); break; case V4L2_CID_RDS_TX_PS_NAME: if (dev->radio_rx_rds_controls) v4l2_ctrl_s_ctrl_string(dev->radio_rx_rds_psname, ctrl->p_new.p_char); break; case V4L2_CID_RDS_TX_RADIO_TEXT: if (dev->radio_rx_rds_controls) v4l2_ctrl_s_ctrl_string(dev->radio_rx_rds_radiotext, ctrl->p_new.p_char); break; case V4L2_CID_RDS_TX_TRAFFIC_ANNOUNCEMENT: if (dev->radio_rx_rds_controls) v4l2_ctrl_s_ctrl(dev->radio_rx_rds_ta, ctrl->val); break; case V4L2_CID_RDS_TX_TRAFFIC_PROGRAM: if (dev->radio_rx_rds_controls) v4l2_ctrl_s_ctrl(dev->radio_rx_rds_tp, ctrl->val); break; case V4L2_CID_RDS_TX_MUSIC_SPEECH: if (dev->radio_rx_rds_controls) v4l2_ctrl_s_ctrl(dev->radio_rx_rds_ms, ctrl->val); break; } return 0; } static const struct v4l2_ctrl_ops vivid_radio_tx_ctrl_ops = { .s_ctrl = vivid_radio_tx_s_ctrl, }; static const struct v4l2_ctrl_config vivid_ctrl_radio_tx_rds_blockio = { .ops = &vivid_radio_tx_ctrl_ops, .id = VIVID_CID_RADIO_TX_RDS_BLOCKIO, .name = "RDS Tx I/O Mode", .type = V4L2_CTRL_TYPE_MENU, .qmenu = vivid_ctrl_radio_rds_mode_strings, .max = 1, .def = 1, }; /* SDR Capture Controls */ static int vivid_sdr_cap_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_sdr_cap); switch (ctrl->id) { case VIVID_CID_SDR_CAP_FM_DEVIATION: dev->sdr_fm_deviation = ctrl->val; break; } return 0; } static const struct v4l2_ctrl_ops vivid_sdr_cap_ctrl_ops = { .s_ctrl = vivid_sdr_cap_s_ctrl, }; static const struct v4l2_ctrl_config vivid_ctrl_sdr_cap_fm_deviation = { .ops = &vivid_sdr_cap_ctrl_ops, .id = VIVID_CID_SDR_CAP_FM_DEVIATION, .name = "FM Deviation", .type = V4L2_CTRL_TYPE_INTEGER, .min = 100, .max = 200000, .def = 75000, .step = 1, }; /* Metadata Capture Control */ static int vivid_meta_cap_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_meta_cap); switch (ctrl->id) { case VIVID_CID_META_CAP_GENERATE_PTS: dev->meta_pts = ctrl->val; break; case VIVID_CID_META_CAP_GENERATE_SCR: dev->meta_scr = ctrl->val; break; } return 0; } static const struct v4l2_ctrl_ops vivid_meta_cap_ctrl_ops = { .s_ctrl = vivid_meta_cap_s_ctrl, }; static const struct v4l2_ctrl_config vivid_ctrl_meta_has_pts = { .ops = &vivid_meta_cap_ctrl_ops, .id = VIVID_CID_META_CAP_GENERATE_PTS, .name = "Generate PTS", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .def = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_meta_has_src_clk = { .ops = &vivid_meta_cap_ctrl_ops, .id = VIVID_CID_META_CAP_GENERATE_SCR, .name = "Generate SCR", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .def = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_class = { .ops = &vivid_user_gen_ctrl_ops, .flags = V4L2_CTRL_FLAG_READ_ONLY | V4L2_CTRL_FLAG_WRITE_ONLY, .id = VIVID_CID_VIVID_CLASS, .name = "Vivid Controls", .type = V4L2_CTRL_TYPE_CTRL_CLASS, }; int vivid_create_controls(struct vivid_dev *dev, bool show_ccs_cap, bool show_ccs_out, bool no_error_inj, bool has_sdtv, bool has_hdmi) { struct v4l2_ctrl_handler *hdl_user_gen = &dev->ctrl_hdl_user_gen; struct v4l2_ctrl_handler *hdl_user_vid = &dev->ctrl_hdl_user_vid; struct v4l2_ctrl_handler *hdl_user_aud = &dev->ctrl_hdl_user_aud; struct v4l2_ctrl_handler *hdl_streaming = &dev->ctrl_hdl_streaming; struct v4l2_ctrl_handler *hdl_sdtv_cap = &dev->ctrl_hdl_sdtv_cap; struct v4l2_ctrl_handler *hdl_loop_cap = &dev->ctrl_hdl_loop_cap; struct v4l2_ctrl_handler *hdl_fb = &dev->ctrl_hdl_fb; struct v4l2_ctrl_handler *hdl_vid_cap = &dev->ctrl_hdl_vid_cap; struct v4l2_ctrl_handler *hdl_vid_out = &dev->ctrl_hdl_vid_out; struct v4l2_ctrl_handler *hdl_vbi_cap = &dev->ctrl_hdl_vbi_cap; struct v4l2_ctrl_handler *hdl_vbi_out = &dev->ctrl_hdl_vbi_out; struct v4l2_ctrl_handler *hdl_radio_rx = &dev->ctrl_hdl_radio_rx; struct v4l2_ctrl_handler *hdl_radio_tx = &dev->ctrl_hdl_radio_tx; struct v4l2_ctrl_handler *hdl_sdr_cap = &dev->ctrl_hdl_sdr_cap; struct v4l2_ctrl_handler *hdl_meta_cap = &dev->ctrl_hdl_meta_cap; struct v4l2_ctrl_handler *hdl_meta_out = &dev->ctrl_hdl_meta_out; struct v4l2_ctrl_handler *hdl_tch_cap = &dev->ctrl_hdl_touch_cap; struct v4l2_ctrl_config vivid_ctrl_dv_timings = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_DV_TIMINGS, .name = "DV Timings", .type = V4L2_CTRL_TYPE_MENU, }; int i; v4l2_ctrl_handler_init(hdl_user_gen, 10); v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_user_vid, 9); v4l2_ctrl_new_custom(hdl_user_vid, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_user_aud, 2); v4l2_ctrl_new_custom(hdl_user_aud, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_streaming, 8); v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_sdtv_cap, 2); v4l2_ctrl_new_custom(hdl_sdtv_cap, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_loop_cap, 1); v4l2_ctrl_new_custom(hdl_loop_cap, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_fb, 1); v4l2_ctrl_new_custom(hdl_fb, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_vid_cap, 55); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_vid_out, 26); if (!no_error_inj || dev->has_fb || dev->num_hdmi_outputs) v4l2_ctrl_new_custom(hdl_vid_out, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_vbi_cap, 21); v4l2_ctrl_new_custom(hdl_vbi_cap, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_vbi_out, 19); if (!no_error_inj) v4l2_ctrl_new_custom(hdl_vbi_out, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_radio_rx, 17); v4l2_ctrl_new_custom(hdl_radio_rx, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_radio_tx, 17); v4l2_ctrl_new_custom(hdl_radio_tx, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_sdr_cap, 19); v4l2_ctrl_new_custom(hdl_sdr_cap, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_meta_cap, 2); v4l2_ctrl_new_custom(hdl_meta_cap, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_meta_out, 2); v4l2_ctrl_new_custom(hdl_meta_out, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_tch_cap, 2); v4l2_ctrl_new_custom(hdl_tch_cap, &vivid_ctrl_class, NULL); /* User Controls */ dev->volume = v4l2_ctrl_new_std(hdl_user_aud, NULL, V4L2_CID_AUDIO_VOLUME, 0, 255, 1, 200); dev->mute = v4l2_ctrl_new_std(hdl_user_aud, NULL, V4L2_CID_AUDIO_MUTE, 0, 1, 1, 0); if (dev->has_vid_cap) { dev->brightness = v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_BRIGHTNESS, 0, 255, 1, 128); for (i = 0; i < MAX_INPUTS; i++) dev->input_brightness[i] = 128; dev->contrast = v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_CONTRAST, 0, 255, 1, 128); dev->saturation = v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_SATURATION, 0, 255, 1, 128); dev->hue = v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_HUE, -128, 128, 1, 0); v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_HFLIP, 0, 1, 1, 0); v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_VFLIP, 0, 1, 1, 0); dev->autogain = v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_AUTOGAIN, 0, 1, 1, 1); dev->gain = v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_GAIN, 0, 255, 1, 100); dev->alpha = v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_ALPHA_COMPONENT, 0, 255, 1, 0); } dev->button = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_button, NULL); dev->int32 = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_int32, NULL); dev->int64 = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_int64, NULL); dev->boolean = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_boolean, NULL); dev->menu = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_menu, NULL); dev->string = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_string, NULL); dev->bitmask = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_bitmask, NULL); dev->int_menu = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_int_menu, NULL); dev->ro_int32 = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_ro_int32, NULL); v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_area, NULL); v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_rect, NULL); v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_u32_array, NULL); v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_u32_dyn_array, NULL); v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_u16_matrix, NULL); v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_u8_4d_array, NULL); dev->pixel_array = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_u8_pixel_array, NULL); v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_s32_array, NULL); v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_s64_array, NULL); if (dev->has_vid_cap) { /* Image Processing Controls */ struct v4l2_ctrl_config vivid_ctrl_test_pattern = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_TEST_PATTERN, .name = "Test Pattern", .type = V4L2_CTRL_TYPE_MENU, .max = TPG_PAT_NOISE, .qmenu = tpg_pattern_strings, }; dev->test_pattern = v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_test_pattern, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_perc_fill, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_hor_movement, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_vert_movement, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_osd_mode, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_show_border, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_show_square, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_hflip, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_vflip, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_insert_sav, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_insert_eav, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_insert_hdmi_video_guard_band, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_reduced_fps, NULL); WARN_ON(dev->num_hdmi_inputs > MAX_HDMI_INPUTS); WARN_ON(dev->num_svid_inputs > MAX_SVID_INPUTS); for (u8 i = 0; i < dev->num_hdmi_inputs; i++) { snprintf(dev->ctrl_hdmi_to_output_names[i], sizeof(dev->ctrl_hdmi_to_output_names[i]), "HDMI %03u-%u Is Connected To", dev->inst, i); } for (u8 i = 0; i < dev->num_hdmi_inputs; i++) { struct v4l2_ctrl_config ctrl_config = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_HDMI_IS_CONNECTED_TO_OUTPUT(i), .name = dev->ctrl_hdmi_to_output_names[i], .type = V4L2_CTRL_TYPE_MENU, .max = 1, .qmenu = (const char * const *)vivid_ctrl_hdmi_to_output_strings, }; dev->ctrl_hdmi_to_output[i] = v4l2_ctrl_new_custom(hdl_vid_cap, &ctrl_config, NULL); } for (u8 i = 0; i < dev->num_svid_inputs; i++) { snprintf(dev->ctrl_svid_to_output_names[i], sizeof(dev->ctrl_svid_to_output_names[i]), "S-Video %03u-%u Is Connected To", dev->inst, i); } for (u8 i = 0; i < dev->num_svid_inputs; i++) { struct v4l2_ctrl_config ctrl_config = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_SVID_IS_CONNECTED_TO_OUTPUT(i), .name = dev->ctrl_svid_to_output_names[i], .type = V4L2_CTRL_TYPE_MENU, .max = 1, .qmenu = (const char * const *)vivid_ctrl_svid_to_output_strings, }; dev->ctrl_svid_to_output[i] = v4l2_ctrl_new_custom(hdl_vid_cap, &ctrl_config, NULL); } if (show_ccs_cap) { dev->ctrl_has_crop_cap = v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_has_crop_cap, NULL); dev->ctrl_has_compose_cap = v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_has_compose_cap, NULL); dev->ctrl_has_scaler_cap = v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_has_scaler_cap, NULL); } v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_tstamp_src, NULL); dev->colorspace = v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_colorspace, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_xfer_func, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_ycbcr_enc, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_hsv_enc, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_quantization, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_alpha_mode, NULL); } if (dev->has_vid_out && show_ccs_out) { dev->ctrl_has_crop_out = v4l2_ctrl_new_custom(hdl_vid_out, &vivid_ctrl_has_crop_out, NULL); dev->ctrl_has_compose_out = v4l2_ctrl_new_custom(hdl_vid_out, &vivid_ctrl_has_compose_out, NULL); dev->ctrl_has_scaler_out = v4l2_ctrl_new_custom(hdl_vid_out, &vivid_ctrl_has_scaler_out, NULL); } /* * Testing this driver with v4l2-compliance will trigger the error * injection controls, and after that nothing will work as expected. * So we have a module option to drop these error injecting controls * allowing us to run v4l2_compliance again. */ if (!no_error_inj) { v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_disconnect, NULL); v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_dqbuf_error, NULL); v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_perc_dropped, NULL); v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_queue_setup_error, NULL); v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_buf_prepare_error, NULL); v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_start_streaming_error, NULL); v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_queue_error, NULL); #ifdef CONFIG_MEDIA_CONTROLLER v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_req_validate_error, NULL); #endif v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_seq_wrap, NULL); v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_time_wrap, NULL); } if (has_sdtv && (dev->has_vid_cap || dev->has_vbi_cap)) { if (dev->has_vid_cap) v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_std_aspect_ratio, NULL); dev->ctrl_std_signal_mode = v4l2_ctrl_new_custom(hdl_sdtv_cap, &vivid_ctrl_std_signal_mode, NULL); dev->ctrl_standard = v4l2_ctrl_new_custom(hdl_sdtv_cap, &vivid_ctrl_standard, NULL); if (dev->ctrl_std_signal_mode) v4l2_ctrl_cluster(2, &dev->ctrl_std_signal_mode); if (dev->has_raw_vbi_cap) v4l2_ctrl_new_custom(hdl_vbi_cap, &vivid_ctrl_vbi_cap_interlaced, NULL); } if (dev->num_hdmi_inputs) { s64 hdmi_input_mask = GENMASK(dev->num_hdmi_inputs - 1, 0); dev->ctrl_dv_timings_signal_mode = v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_dv_timings_signal_mode, NULL); vivid_ctrl_dv_timings.max = dev->query_dv_timings_size - 1; vivid_ctrl_dv_timings.qmenu = (const char * const *)dev->query_dv_timings_qmenu; dev->ctrl_dv_timings = v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_dv_timings, NULL); if (dev->ctrl_dv_timings_signal_mode) v4l2_ctrl_cluster(2, &dev->ctrl_dv_timings_signal_mode); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_dv_timings_aspect_ratio, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_max_edid_blocks, NULL); dev->real_rgb_range_cap = v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_limited_rgb_range, NULL); dev->rgb_range_cap = v4l2_ctrl_new_std_menu(hdl_vid_cap, &vivid_vid_cap_ctrl_ops, V4L2_CID_DV_RX_RGB_RANGE, V4L2_DV_RGB_RANGE_FULL, 0, V4L2_DV_RGB_RANGE_AUTO); dev->ctrl_rx_power_present = v4l2_ctrl_new_std(hdl_vid_cap, NULL, V4L2_CID_DV_RX_POWER_PRESENT, 0, hdmi_input_mask, 0, hdmi_input_mask); } if (dev->num_hdmi_outputs) { s64 hdmi_output_mask = GENMASK(dev->num_hdmi_outputs - 1, 0); /* * We aren't doing anything with this at the moment, but * HDMI outputs typically have this controls. */ dev->ctrl_tx_rgb_range = v4l2_ctrl_new_std_menu(hdl_vid_out, NULL, V4L2_CID_DV_TX_RGB_RANGE, V4L2_DV_RGB_RANGE_FULL, 0, V4L2_DV_RGB_RANGE_AUTO); dev->ctrl_tx_mode = v4l2_ctrl_new_std_menu(hdl_vid_out, NULL, V4L2_CID_DV_TX_MODE, V4L2_DV_TX_MODE_HDMI, 0, V4L2_DV_TX_MODE_HDMI); dev->ctrl_tx_hotplug = v4l2_ctrl_new_std(hdl_vid_out, NULL, V4L2_CID_DV_TX_HOTPLUG, 0, hdmi_output_mask, 0, 0); dev->ctrl_tx_rxsense = v4l2_ctrl_new_std(hdl_vid_out, NULL, V4L2_CID_DV_TX_RXSENSE, 0, hdmi_output_mask, 0, 0); dev->ctrl_tx_edid_present = v4l2_ctrl_new_std(hdl_vid_out, NULL, V4L2_CID_DV_TX_EDID_PRESENT, 0, hdmi_output_mask, 0, 0); } if (dev->has_fb) v4l2_ctrl_new_custom(hdl_fb, &vivid_ctrl_clear_fb, NULL); if (dev->has_radio_rx) { v4l2_ctrl_new_custom(hdl_radio_rx, &vivid_ctrl_radio_hw_seek_mode, NULL); v4l2_ctrl_new_custom(hdl_radio_rx, &vivid_ctrl_radio_hw_seek_prog_lim, NULL); v4l2_ctrl_new_custom(hdl_radio_rx, &vivid_ctrl_radio_rx_rds_blockio, NULL); v4l2_ctrl_new_custom(hdl_radio_rx, &vivid_ctrl_radio_rx_rds_rbds, NULL); v4l2_ctrl_new_std(hdl_radio_rx, &vivid_radio_rx_ctrl_ops, V4L2_CID_RDS_RECEPTION, 0, 1, 1, 1); dev->radio_rx_rds_pty = v4l2_ctrl_new_std(hdl_radio_rx, &vivid_radio_rx_ctrl_ops, V4L2_CID_RDS_RX_PTY, 0, 31, 1, 0); dev->radio_rx_rds_psname = v4l2_ctrl_new_std(hdl_radio_rx, &vivid_radio_rx_ctrl_ops, V4L2_CID_RDS_RX_PS_NAME, 0, 8, 8, 0); dev->radio_rx_rds_radiotext = v4l2_ctrl_new_std(hdl_radio_rx, &vivid_radio_rx_ctrl_ops, V4L2_CID_RDS_RX_RADIO_TEXT, 0, 64, 64, 0); dev->radio_rx_rds_ta = v4l2_ctrl_new_std(hdl_radio_rx, &vivid_radio_rx_ctrl_ops, V4L2_CID_RDS_RX_TRAFFIC_ANNOUNCEMENT, 0, 1, 1, 0); dev->radio_rx_rds_tp = v4l2_ctrl_new_std(hdl_radio_rx, &vivid_radio_rx_ctrl_ops, V4L2_CID_RDS_RX_TRAFFIC_PROGRAM, 0, 1, 1, 0); dev->radio_rx_rds_ms = v4l2_ctrl_new_std(hdl_radio_rx, &vivid_radio_rx_ctrl_ops, V4L2_CID_RDS_RX_MUSIC_SPEECH, 0, 1, 1, 1); } if (dev->has_radio_tx) { v4l2_ctrl_new_custom(hdl_radio_tx, &vivid_ctrl_radio_tx_rds_blockio, NULL); dev->radio_tx_rds_pi = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_PI, 0, 0xffff, 1, 0x8088); dev->radio_tx_rds_pty = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_PTY, 0, 31, 1, 3); dev->radio_tx_rds_psname = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_PS_NAME, 0, 8, 8, 0); if (dev->radio_tx_rds_psname) v4l2_ctrl_s_ctrl_string(dev->radio_tx_rds_psname, "VIVID-TX"); dev->radio_tx_rds_radiotext = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_RADIO_TEXT, 0, 64 * 2, 64, 0); if (dev->radio_tx_rds_radiotext) v4l2_ctrl_s_ctrl_string(dev->radio_tx_rds_radiotext, "This is a VIVID default Radio Text template text, change at will"); dev->radio_tx_rds_mono_stereo = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_MONO_STEREO, 0, 1, 1, 1); dev->radio_tx_rds_art_head = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_ARTIFICIAL_HEAD, 0, 1, 1, 0); dev->radio_tx_rds_compressed = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_COMPRESSED, 0, 1, 1, 0); dev->radio_tx_rds_dyn_pty = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_DYNAMIC_PTY, 0, 1, 1, 0); dev->radio_tx_rds_ta = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_TRAFFIC_ANNOUNCEMENT, 0, 1, 1, 0); dev->radio_tx_rds_tp = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_TRAFFIC_PROGRAM, 0, 1, 1, 1); dev->radio_tx_rds_ms = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_MUSIC_SPEECH, 0, 1, 1, 1); } if (dev->has_sdr_cap) { v4l2_ctrl_new_custom(hdl_sdr_cap, &vivid_ctrl_sdr_cap_fm_deviation, NULL); } if (dev->has_meta_cap) { v4l2_ctrl_new_custom(hdl_meta_cap, &vivid_ctrl_meta_has_pts, NULL); v4l2_ctrl_new_custom(hdl_meta_cap, &vivid_ctrl_meta_has_src_clk, NULL); } if (hdl_user_gen->error) return hdl_user_gen->error; if (hdl_user_vid->error) return hdl_user_vid->error; if (hdl_user_aud->error) return hdl_user_aud->error; if (hdl_streaming->error) return hdl_streaming->error; if (hdl_sdr_cap->error) return hdl_sdr_cap->error; if (hdl_loop_cap->error) return hdl_loop_cap->error; if (dev->autogain) v4l2_ctrl_auto_cluster(2, &dev->autogain, 0, true); if (dev->has_vid_cap) { v4l2_ctrl_add_handler(hdl_vid_cap, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_vid_cap, hdl_user_vid, NULL, false); v4l2_ctrl_add_handler(hdl_vid_cap, hdl_user_aud, NULL, false); v4l2_ctrl_add_handler(hdl_vid_cap, hdl_streaming, NULL, false); v4l2_ctrl_add_handler(hdl_vid_cap, hdl_sdtv_cap, NULL, false); v4l2_ctrl_add_handler(hdl_vid_cap, hdl_loop_cap, NULL, false); v4l2_ctrl_add_handler(hdl_vid_cap, hdl_fb, NULL, false); if (hdl_vid_cap->error) return hdl_vid_cap->error; dev->vid_cap_dev.ctrl_handler = hdl_vid_cap; } if (dev->has_vid_out) { v4l2_ctrl_add_handler(hdl_vid_out, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_vid_out, hdl_user_aud, NULL, false); v4l2_ctrl_add_handler(hdl_vid_out, hdl_streaming, NULL, false); v4l2_ctrl_add_handler(hdl_vid_out, hdl_fb, NULL, false); if (hdl_vid_out->error) return hdl_vid_out->error; dev->vid_out_dev.ctrl_handler = hdl_vid_out; } if (dev->has_vbi_cap) { v4l2_ctrl_add_handler(hdl_vbi_cap, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_vbi_cap, hdl_streaming, NULL, false); v4l2_ctrl_add_handler(hdl_vbi_cap, hdl_sdtv_cap, NULL, false); v4l2_ctrl_add_handler(hdl_vbi_cap, hdl_loop_cap, NULL, false); if (hdl_vbi_cap->error) return hdl_vbi_cap->error; dev->vbi_cap_dev.ctrl_handler = hdl_vbi_cap; } if (dev->has_vbi_out) { v4l2_ctrl_add_handler(hdl_vbi_out, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_vbi_out, hdl_streaming, NULL, false); if (hdl_vbi_out->error) return hdl_vbi_out->error; dev->vbi_out_dev.ctrl_handler = hdl_vbi_out; } if (dev->has_radio_rx) { v4l2_ctrl_add_handler(hdl_radio_rx, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_radio_rx, hdl_user_aud, NULL, false); if (hdl_radio_rx->error) return hdl_radio_rx->error; dev->radio_rx_dev.ctrl_handler = hdl_radio_rx; } if (dev->has_radio_tx) { v4l2_ctrl_add_handler(hdl_radio_tx, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_radio_tx, hdl_user_aud, NULL, false); if (hdl_radio_tx->error) return hdl_radio_tx->error; dev->radio_tx_dev.ctrl_handler = hdl_radio_tx; } if (dev->has_sdr_cap) { v4l2_ctrl_add_handler(hdl_sdr_cap, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_sdr_cap, hdl_streaming, NULL, false); if (hdl_sdr_cap->error) return hdl_sdr_cap->error; dev->sdr_cap_dev.ctrl_handler = hdl_sdr_cap; } if (dev->has_meta_cap) { v4l2_ctrl_add_handler(hdl_meta_cap, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_meta_cap, hdl_streaming, NULL, false); if (hdl_meta_cap->error) return hdl_meta_cap->error; dev->meta_cap_dev.ctrl_handler = hdl_meta_cap; } if (dev->has_meta_out) { v4l2_ctrl_add_handler(hdl_meta_out, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_meta_out, hdl_streaming, NULL, false); if (hdl_meta_out->error) return hdl_meta_out->error; dev->meta_out_dev.ctrl_handler = hdl_meta_out; } if (dev->has_touch_cap) { v4l2_ctrl_add_handler(hdl_tch_cap, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_tch_cap, hdl_streaming, NULL, false); if (hdl_tch_cap->error) return hdl_tch_cap->error; dev->touch_cap_dev.ctrl_handler = hdl_tch_cap; } return 0; } void vivid_free_controls(struct vivid_dev *dev) { v4l2_ctrl_handler_free(&dev->ctrl_hdl_vid_cap); v4l2_ctrl_handler_free(&dev->ctrl_hdl_vid_out); v4l2_ctrl_handler_free(&dev->ctrl_hdl_vbi_cap); v4l2_ctrl_handler_free(&dev->ctrl_hdl_vbi_out); v4l2_ctrl_handler_free(&dev->ctrl_hdl_radio_rx); v4l2_ctrl_handler_free(&dev->ctrl_hdl_radio_tx); v4l2_ctrl_handler_free(&dev->ctrl_hdl_sdr_cap); v4l2_ctrl_handler_free(&dev->ctrl_hdl_user_gen); v4l2_ctrl_handler_free(&dev->ctrl_hdl_user_vid); v4l2_ctrl_handler_free(&dev->ctrl_hdl_user_aud); v4l2_ctrl_handler_free(&dev->ctrl_hdl_streaming); v4l2_ctrl_handler_free(&dev->ctrl_hdl_sdtv_cap); v4l2_ctrl_handler_free(&dev->ctrl_hdl_loop_cap); v4l2_ctrl_handler_free(&dev->ctrl_hdl_fb); v4l2_ctrl_handler_free(&dev->ctrl_hdl_meta_cap); v4l2_ctrl_handler_free(&dev->ctrl_hdl_meta_out); v4l2_ctrl_handler_free(&dev->ctrl_hdl_touch_cap); } |
| 5315 3971 2319 53 60 6 6 16 200 262 1 73 5 191 188 157 84 182 183 183 7 8 7 13 13 13 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 | // SPDX-License-Identifier: GPL-2.0 /* * Kernel internal schedule timeout and sleeping functions */ #include <linux/delay.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include "tick-internal.h" /* * Since schedule_timeout()'s timer is defined on the stack, it must store * the target task on the stack as well. */ struct process_timer { struct timer_list timer; struct task_struct *task; }; static void process_timeout(struct timer_list *t) { struct process_timer *timeout = from_timer(timeout, t, timer); wake_up_process(timeout->task); } /** * schedule_timeout - sleep until timeout * @timeout: timeout value in jiffies * * Make the current task sleep until @timeout jiffies have elapsed. * The function behavior depends on the current task state * (see also set_current_state() description): * * %TASK_RUNNING - the scheduler is called, but the task does not sleep * at all. That happens because sched_submit_work() does nothing for * tasks in %TASK_RUNNING state. * * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be %TASK_RUNNING when this * routine returns. * * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule * the CPU away without a bound on the timeout. In this case the return * value will be %MAX_SCHEDULE_TIMEOUT. * * Returns: 0 when the timer has expired otherwise the remaining time in * jiffies will be returned. In all cases the return value is guaranteed * to be non-negative. */ signed long __sched schedule_timeout(signed long timeout) { struct process_timer timer; unsigned long expire; switch (timeout) { case MAX_SCHEDULE_TIMEOUT: /* * These two special cases are useful to be comfortable * in the caller. Nothing more. We could take * MAX_SCHEDULE_TIMEOUT from one of the negative value * but I' d like to return a valid offset (>=0) to allow * the caller to do everything it want with the retval. */ schedule(); goto out; default: /* * Another bit of PARANOID. Note that the retval will be * 0 since no piece of kernel is supposed to do a check * for a negative retval of schedule_timeout() (since it * should never happens anyway). You just have the printk() * that will tell you if something is gone wrong and where. */ if (timeout < 0) { pr_err("%s: wrong timeout value %lx\n", __func__, timeout); dump_stack(); __set_current_state(TASK_RUNNING); goto out; } } expire = timeout + jiffies; timer.task = current; timer_setup_on_stack(&timer.timer, process_timeout, 0); timer.timer.expires = expire; add_timer(&timer.timer); schedule(); timer_delete_sync(&timer.timer); /* Remove the timer from the object tracker */ destroy_timer_on_stack(&timer.timer); timeout = expire - jiffies; out: return timeout < 0 ? 0 : timeout; } EXPORT_SYMBOL(schedule_timeout); /* * __set_current_state() can be used in schedule_timeout_*() functions, because * schedule_timeout() calls schedule() unconditionally. */ /** * schedule_timeout_interruptible - sleep until timeout (interruptible) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_INTERRUPTIBLE before starting the timeout. */ signed long __sched schedule_timeout_interruptible(signed long timeout) { __set_current_state(TASK_INTERRUPTIBLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_interruptible); /** * schedule_timeout_killable - sleep until timeout (killable) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_KILLABLE before starting the timeout. */ signed long __sched schedule_timeout_killable(signed long timeout) { __set_current_state(TASK_KILLABLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_killable); /** * schedule_timeout_uninterruptible - sleep until timeout (uninterruptible) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_UNINTERRUPTIBLE before starting the timeout. */ signed long __sched schedule_timeout_uninterruptible(signed long timeout) { __set_current_state(TASK_UNINTERRUPTIBLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_uninterruptible); /** * schedule_timeout_idle - sleep until timeout (idle) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_IDLE before starting the timeout. It is similar to * schedule_timeout_uninterruptible(), except this task will not contribute to * load average. */ signed long __sched schedule_timeout_idle(signed long timeout) { __set_current_state(TASK_IDLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_idle); /** * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * @clock_id: timer clock to be used * * Details are explained in schedule_hrtimeout_range() function description as * this function is commonly used. */ int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, clockid_t clock_id) { struct hrtimer_sleeper t; /* * Optimize when a zero timeout value is given. It does not * matter whether this is an absolute or a relative time. */ if (expires && *expires == 0) { __set_current_state(TASK_RUNNING); return 0; } /* * A NULL parameter means "infinite" */ if (!expires) { schedule(); return -EINTR; } hrtimer_setup_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_sleeper_start_expires(&t, mode); if (likely(t.task)) schedule(); hrtimer_cancel(&t.timer); destroy_hrtimer_on_stack(&t.timer); __set_current_state(TASK_RUNNING); return !t.task ? 0 : -EINTR; } EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); /** * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless * the current task state has been set (see set_current_state()). * * The @delta argument gives the kernel the freedom to schedule the * actual wakeup to a time that is both power and performance friendly * for regular (non RT/DL) tasks. * The kernel give the normal best effort behavior for "@expires+@delta", * but may decide to fire the timer earlier, but no earlier than @expires. * * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * * Returns: 0 when the timer has expired. If the task was woken before the * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, CLOCK_MONOTONIC); } EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); /** * schedule_hrtimeout - sleep until timeout * @expires: timeout value (ktime_t) * @mode: timer mode * * See schedule_hrtimeout_range() for details. @delta argument of * schedule_hrtimeout_range() is set to 0 and has therefore no impact. */ int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) { return schedule_hrtimeout_range(expires, 0, mode); } EXPORT_SYMBOL_GPL(schedule_hrtimeout); /** * msleep - sleep safely even with waitqueue interruptions * @msecs: Requested sleep duration in milliseconds * * msleep() uses jiffy based timeouts for the sleep duration. Because of the * design of the timer wheel, the maximum additional percentage delay (slack) is * 12.5%. This is only valid for timers which will end up in level 1 or a higher * level of the timer wheel. For explanation of those 12.5% please check the * detailed description about the basics of the timer wheel. * * The slack of timers which will end up in level 0 depends on sleep duration * (msecs) and HZ configuration and can be calculated in the following way (with * the timer wheel design restriction that the slack is not less than 12.5%): * * ``slack = MSECS_PER_TICK / msecs`` * * When the allowed slack of the callsite is known, the calculation could be * turned around to find the minimal allowed sleep duration to meet the * constraints. For example: * * * ``HZ=1000`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 1 / (1/4) = 4``: * all sleep durations greater or equal 4ms will meet the constraints. * * ``HZ=1000`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 1 / (1/8) = 8``: * all sleep durations greater or equal 8ms will meet the constraints. * * ``HZ=250`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 4 / (1/4) = 16``: * all sleep durations greater or equal 16ms will meet the constraints. * * ``HZ=250`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 4 / (1/8) = 32``: * all sleep durations greater or equal 32ms will meet the constraints. * * See also the signal aware variant msleep_interruptible(). */ void msleep(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs); while (timeout) timeout = schedule_timeout_uninterruptible(timeout); } EXPORT_SYMBOL(msleep); /** * msleep_interruptible - sleep waiting for signals * @msecs: Requested sleep duration in milliseconds * * See msleep() for some basic information. * * The difference between msleep() and msleep_interruptible() is that the sleep * could be interrupted by a signal delivery and then returns early. * * Returns: The remaining time of the sleep duration transformed to msecs (see * schedule_timeout() for details). */ unsigned long msleep_interruptible(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs); while (timeout && !signal_pending(current)) timeout = schedule_timeout_interruptible(timeout); return jiffies_to_msecs(timeout); } EXPORT_SYMBOL(msleep_interruptible); /** * usleep_range_state - Sleep for an approximate time in a given state * @min: Minimum time in usecs to sleep * @max: Maximum time in usecs to sleep * @state: State of the current task that will be while sleeping * * usleep_range_state() sleeps at least for the minimum specified time but not * longer than the maximum specified amount of time. The range might reduce * power usage by allowing hrtimers to coalesce an already scheduled interrupt * with this hrtimer. In the worst case, an interrupt is scheduled for the upper * bound. * * The sleeping task is set to the specified state before starting the sleep. * * In non-atomic context where the exact wakeup time is flexible, use * usleep_range() or its variants instead of udelay(). The sleep improves * responsiveness by avoiding the CPU-hogging busy-wait of udelay(). */ void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned int state) { ktime_t exp = ktime_add_us(ktime_get(), min); u64 delta = (u64)(max - min) * NSEC_PER_USEC; if (WARN_ON_ONCE(max < min)) delta = 0; for (;;) { __set_current_state(state); /* Do not return before the requested sleep time has elapsed */ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) break; } } EXPORT_SYMBOL(usleep_range_state); |
| 10 7 3 8 1 1 2 1 2 2 1 2 1 1 2 5 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> struct nft_last { unsigned long jiffies; unsigned int set; }; struct nft_last_priv { struct nft_last *last; }; static const struct nla_policy nft_last_policy[NFTA_LAST_MAX + 1] = { [NFTA_LAST_SET] = { .type = NLA_U32 }, [NFTA_LAST_MSECS] = { .type = NLA_U64 }, }; static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_last_priv *priv = nft_expr_priv(expr); struct nft_last *last; u64 last_jiffies; int err; last = kzalloc(sizeof(*last), GFP_KERNEL_ACCOUNT); if (!last) return -ENOMEM; if (tb[NFTA_LAST_SET]) last->set = ntohl(nla_get_be32(tb[NFTA_LAST_SET])); if (last->set && tb[NFTA_LAST_MSECS]) { err = nf_msecs_to_jiffies64(tb[NFTA_LAST_MSECS], &last_jiffies); if (err < 0) goto err; last->jiffies = jiffies - (unsigned long)last_jiffies; } priv->last = last; return 0; err: kfree(last); return err; } static void nft_last_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_last_priv *priv = nft_expr_priv(expr); struct nft_last *last = priv->last; if (READ_ONCE(last->jiffies) != jiffies) WRITE_ONCE(last->jiffies, jiffies); if (READ_ONCE(last->set) == 0) WRITE_ONCE(last->set, 1); } static int nft_last_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { struct nft_last_priv *priv = nft_expr_priv(expr); struct nft_last *last = priv->last; unsigned long last_jiffies = READ_ONCE(last->jiffies); u32 last_set = READ_ONCE(last->set); __be64 msecs; if (time_before(jiffies, last_jiffies)) { WRITE_ONCE(last->set, 0); last_set = 0; } if (last_set) msecs = nf_jiffies64_to_msecs(jiffies - last_jiffies); else msecs = 0; if (nla_put_be32(skb, NFTA_LAST_SET, htonl(last_set)) || nla_put_be64(skb, NFTA_LAST_MSECS, msecs, NFTA_LAST_PAD)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static void nft_last_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_last_priv *priv = nft_expr_priv(expr); kfree(priv->last); } static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_last_priv *priv_dst = nft_expr_priv(dst); struct nft_last_priv *priv_src = nft_expr_priv(src); priv_dst->last = kzalloc(sizeof(*priv_dst->last), gfp); if (!priv_dst->last) return -ENOMEM; priv_dst->last->set = priv_src->last->set; priv_dst->last->jiffies = priv_src->last->jiffies; return 0; } static const struct nft_expr_ops nft_last_ops = { .type = &nft_last_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_last_priv)), .eval = nft_last_eval, .init = nft_last_init, .destroy = nft_last_destroy, .clone = nft_last_clone, .dump = nft_last_dump, .reduce = NFT_REDUCE_READONLY, }; struct nft_expr_type nft_last_type __read_mostly = { .name = "last", .ops = &nft_last_ops, .policy = nft_last_policy, .maxattr = NFTA_LAST_MAX, .flags = NFT_EXPR_STATEFUL, .owner = THIS_MODULE, }; |
| 16 4 12 12 1 7 3 2 2 1 1 1 4 4 2 5 1 4 3 3 3 3 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 | // SPDX-License-Identifier: GPL-2.0-only /* * * Generic part shared by ipv4 and ipv6 backends. */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <linux/in.h> #include <net/xfrm.h> static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = { [NFTA_XFRM_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DIR] = { .type = NLA_U8 }, [NFTA_XFRM_SPNUM] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DREG] = { .type = NLA_U32 }, }; struct nft_xfrm { enum nft_xfrm_keys key:8; u8 dreg; u8 dir; u8 spnum; u8 len; }; static int nft_xfrm_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_xfrm *priv = nft_expr_priv(expr); unsigned int len = 0; u32 spnum = 0; u8 dir; if (!tb[NFTA_XFRM_KEY] || !tb[NFTA_XFRM_DIR] || !tb[NFTA_XFRM_DREG]) return -EINVAL; switch (ctx->family) { case NFPROTO_IPV4: case NFPROTO_IPV6: case NFPROTO_INET: break; default: return -EOPNOTSUPP; } priv->key = ntohl(nla_get_be32(tb[NFTA_XFRM_KEY])); switch (priv->key) { case NFT_XFRM_KEY_REQID: case NFT_XFRM_KEY_SPI: len = sizeof(u32); break; case NFT_XFRM_KEY_DADDR_IP4: case NFT_XFRM_KEY_SADDR_IP4: len = sizeof(struct in_addr); break; case NFT_XFRM_KEY_DADDR_IP6: case NFT_XFRM_KEY_SADDR_IP6: len = sizeof(struct in6_addr); break; default: return -EINVAL; } dir = nla_get_u8(tb[NFTA_XFRM_DIR]); switch (dir) { case XFRM_POLICY_IN: case XFRM_POLICY_OUT: priv->dir = dir; break; default: return -EINVAL; } if (tb[NFTA_XFRM_SPNUM]) spnum = ntohl(nla_get_be32(tb[NFTA_XFRM_SPNUM])); if (spnum >= XFRM_MAX_DEPTH) return -ERANGE; priv->spnum = spnum; priv->len = len; return nft_parse_register_store(ctx, tb[NFTA_XFRM_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); } /* Return true if key asks for daddr/saddr and current * state does have a valid address (BEET, TUNNEL). */ static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode) { switch (k) { case NFT_XFRM_KEY_DADDR_IP4: case NFT_XFRM_KEY_SADDR_IP4: if (family == NFPROTO_IPV4) break; return false; case NFT_XFRM_KEY_DADDR_IP6: case NFT_XFRM_KEY_SADDR_IP6: if (family == NFPROTO_IPV6) break; return false; default: return true; } return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL || mode == XFRM_MODE_IPTFS; } static void nft_xfrm_state_get_key(const struct nft_xfrm *priv, struct nft_regs *regs, const struct xfrm_state *state) { u32 *dest = ®s->data[priv->dreg]; if (!xfrm_state_addr_ok(priv->key, state->props.family, state->props.mode)) { regs->verdict.code = NFT_BREAK; return; } switch (priv->key) { case NFT_XFRM_KEY_UNSPEC: case __NFT_XFRM_KEY_MAX: WARN_ON_ONCE(1); break; case NFT_XFRM_KEY_DADDR_IP4: *dest = (__force __u32)state->id.daddr.a4; return; case NFT_XFRM_KEY_DADDR_IP6: memcpy(dest, &state->id.daddr.in6, sizeof(struct in6_addr)); return; case NFT_XFRM_KEY_SADDR_IP4: *dest = (__force __u32)state->props.saddr.a4; return; case NFT_XFRM_KEY_SADDR_IP6: memcpy(dest, &state->props.saddr.in6, sizeof(struct in6_addr)); return; case NFT_XFRM_KEY_REQID: *dest = state->props.reqid; return; case NFT_XFRM_KEY_SPI: *dest = (__force __u32)state->id.spi; return; } regs->verdict.code = NFT_BREAK; } static void nft_xfrm_get_eval_in(const struct nft_xfrm *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct sec_path *sp = skb_sec_path(pkt->skb); const struct xfrm_state *state; if (sp == NULL || sp->len <= priv->spnum) { regs->verdict.code = NFT_BREAK; return; } state = sp->xvec[priv->spnum]; nft_xfrm_state_get_key(priv, regs, state); } static void nft_xfrm_get_eval_out(const struct nft_xfrm *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct dst_entry *dst = skb_dst(pkt->skb); int i; for (i = 0; dst && dst->xfrm; dst = ((const struct xfrm_dst *)dst)->child, i++) { if (i < priv->spnum) continue; nft_xfrm_state_get_key(priv, regs, dst->xfrm); return; } regs->verdict.code = NFT_BREAK; } static void nft_xfrm_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_xfrm *priv = nft_expr_priv(expr); switch (priv->dir) { case XFRM_POLICY_IN: nft_xfrm_get_eval_in(priv, regs, pkt); break; case XFRM_POLICY_OUT: nft_xfrm_get_eval_out(priv, regs, pkt); break; default: WARN_ON_ONCE(1); regs->verdict.code = NFT_BREAK; break; } } static int nft_xfrm_get_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_xfrm *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_XFRM_DREG, priv->dreg)) return -1; if (nla_put_be32(skb, NFTA_XFRM_KEY, htonl(priv->key))) return -1; if (nla_put_u8(skb, NFTA_XFRM_DIR, priv->dir)) return -1; if (nla_put_be32(skb, NFTA_XFRM_SPNUM, htonl(priv->spnum))) return -1; return 0; } static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_xfrm *priv = nft_expr_priv(expr); unsigned int hooks; if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; switch (priv->dir) { case XFRM_POLICY_IN: hooks = (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_PRE_ROUTING); break; case XFRM_POLICY_OUT: hooks = (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING); break; default: WARN_ON_ONCE(1); return -EINVAL; } return nft_chain_validate_hooks(ctx->chain, hooks); } static bool nft_xfrm_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_xfrm *priv = nft_expr_priv(expr); const struct nft_xfrm *xfrm; if (!nft_reg_track_cmp(track, expr, priv->dreg)) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } xfrm = nft_expr_priv(track->regs[priv->dreg].selector); if (priv->key != xfrm->key || priv->dreg != xfrm->dreg || priv->dir != xfrm->dir || priv->spnum != xfrm->spnum) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } if (!track->regs[priv->dreg].bitwise) return true; return nft_expr_reduce_bitwise(track, expr); } static struct nft_expr_type nft_xfrm_type; static const struct nft_expr_ops nft_xfrm_get_ops = { .type = &nft_xfrm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_xfrm)), .eval = nft_xfrm_get_eval, .init = nft_xfrm_get_init, .dump = nft_xfrm_get_dump, .validate = nft_xfrm_validate, .reduce = nft_xfrm_reduce, }; static struct nft_expr_type nft_xfrm_type __read_mostly = { .name = "xfrm", .ops = &nft_xfrm_get_ops, .policy = nft_xfrm_policy, .maxattr = NFTA_XFRM_MAX, .owner = THIS_MODULE, }; static int __init nft_xfrm_module_init(void) { return nft_register_expr(&nft_xfrm_type); } static void __exit nft_xfrm_module_exit(void) { nft_unregister_expr(&nft_xfrm_type); } module_init(nft_xfrm_module_init); module_exit(nft_xfrm_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("nf_tables: xfrm/IPSec matching"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_AUTHOR("Máté Eckl <ecklm94@gmail.com>"); MODULE_ALIAS_NFT_EXPR("xfrm"); |
| 402 339 70 1 61 70 171 23 499 20 20 52 2 37 20 2 366 368 364 1 990 265 155 621 2 115 552 552 619 422 60 3 398 22 22 16 6 1 5 5 1 4 7 11 11 6 19 19 13 5 1 4 4 3 1 1 1 14 604 605 599 18 581 19 1 10 10 7 1 15 5 19 19 4 16 1 3 1 1 3 5 13 1 19 3 17 1 15 6 4 6 4 6 8 2 27 1 26 3 1 16 17 1 1 4 1 9 1 143 1 1 1 1 136 8 25 7 1 16 11 4 5 13 1 32 17 14 46 45 11 1 134 1 123 10 1 4 1 80 49 24 2 21 1 1 1 119 12 106 1 39 32 31 1 4 4 1 2 31 1 32 32 2 14 11 14 1 29 1 50 49 30 1 3 1 1 1 38 39 1 40 40 50 279 160 139 158 154 3 2 151 7 17 141 1 142 3 140 5 3 1 1 138 1 4 5 133 2 132 116 23 23 10 1 1 8 1 3 1 23 23 9 132 1 131 1 112 28 1 3 1 1 3 5 1 1 3 3 710 517 367 156 33 132 30 1680 322 1683 861 1451 359 515 420 73 130 338 38 38 38 38 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 Forwarding Information Base: FIB frontend. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/arp.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <net/l3mdev.h> #include <net/lwtunnel.h> #include <trace/events/fib.h> #ifndef CONFIG_IP_MULTIPLE_TABLES static int __net_init fib4_rules_init(struct net *net) { struct fib_table *local_table, *main_table; main_table = fib_trie_table(RT_TABLE_MAIN, NULL); if (!main_table) return -ENOMEM; local_table = fib_trie_table(RT_TABLE_LOCAL, main_table); if (!local_table) goto fail; hlist_add_head_rcu(&local_table->tb_hlist, &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]); hlist_add_head_rcu(&main_table->tb_hlist, &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]); return 0; fail: fib_free_table(main_table); return -ENOMEM; } #else struct fib_table *fib_new_table(struct net *net, u32 id) { struct fib_table *tb, *alias = NULL; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; tb = fib_get_table(net, id); if (tb) return tb; if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules) alias = fib_new_table(net, RT_TABLE_MAIN); tb = fib_trie_table(id, alias); if (!tb) return NULL; switch (id) { case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, tb); break; case RT_TABLE_DEFAULT: rcu_assign_pointer(net->ipv4.fib_default, tb); break; default: break; } h = id & (FIB_TABLE_HASHSZ - 1); hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); return tb; } EXPORT_SYMBOL_GPL(fib_new_table); /* caller must hold either rtnl or rcu read lock */ struct fib_table *fib_get_table(struct net *net, u32 id) { struct fib_table *tb; struct hlist_head *head; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; h = id & (FIB_TABLE_HASHSZ - 1); head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist, lockdep_rtnl_is_held()) { if (tb->tb_id == id) return tb; } return NULL; } #endif /* CONFIG_IP_MULTIPLE_TABLES */ static void fib_replace_table(struct net *net, struct fib_table *old, struct fib_table *new) { #ifdef CONFIG_IP_MULTIPLE_TABLES switch (new->tb_id) { case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, new); break; case RT_TABLE_DEFAULT: rcu_assign_pointer(net->ipv4.fib_default, new); break; default: break; } #endif /* replace the old table in the hlist */ hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist); } int fib_unmerge(struct net *net) { struct fib_table *old, *new, *main_table; /* attempt to fetch local table if it has been allocated */ old = fib_get_table(net, RT_TABLE_LOCAL); if (!old) return 0; new = fib_trie_unmerge(old); if (!new) return -ENOMEM; /* table is already unmerged */ if (new == old) return 0; /* replace merged table with clean table */ fib_replace_table(net, old, new); fib_free_table(old); /* attempt to fetch main table if it has been allocated */ main_table = fib_get_table(net, RT_TABLE_MAIN); if (!main_table) return 0; /* flush local entries from main table */ fib_table_flush_external(main_table); return 0; } void fib_flush(struct net *net) { int flushed = 0; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct hlist_node *tmp; struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) flushed += fib_table_flush(net, tb, false); } if (flushed) rt_cache_flush(net); } /* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. */ static inline unsigned int __inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr, u32 tb_id) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; unsigned int ret = RTN_BROADCAST; struct fib_table *table; if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) return RTN_BROADCAST; if (ipv4_is_multicast(addr)) return RTN_MULTICAST; rcu_read_lock(); table = fib_get_table(net, tb_id); if (table) { ret = RTN_UNICAST; if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) { struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0); if (!dev || dev == nhc->nhc_dev) ret = res.type; } } rcu_read_unlock(); return ret; } unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id) { return __inet_dev_addr_type(net, NULL, addr, tb_id); } EXPORT_SYMBOL(inet_addr_type_table); unsigned int inet_addr_type(struct net *net, __be32 addr) { return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL); } EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, dev, addr, rt_table); } EXPORT_SYMBOL(inet_dev_addr_type); /* inet_addr_type with dev == NULL but using the table from a dev * if one is associated */ unsigned int inet_addr_type_dev_table(struct net *net, const struct net_device *dev, __be32 addr) { u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, NULL, addr, rt_table); } EXPORT_SYMBOL(inet_addr_type_dev_table); __be32 fib_compute_spec_dst(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev; struct fib_result res; struct rtable *rt; struct net *net; int scope; rt = skb_rtable(skb); if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) == RTCF_LOCAL) return ip_hdr(skb)->daddr; in_dev = __in_dev_get_rcu(dev); net = dev_net(dev); scope = RT_SCOPE_UNIVERSE; if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev); struct flowi4 fl4 = { .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))), .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; if (!fib_lookup(net, &fl4, &res, 0)) return fib_result_prefsrc(net, &res); } else { scope = RT_SCOPE_LINK; } return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); } bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) { bool dev_match = false; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (unlikely(fi->nh)) { dev_match = nexthop_uses_dev(fi->nh, dev); } else { int ret; for (ret = 0; ret < fib_info_num_path(fi); ret++) { const struct fib_nh_common *nhc = fib_info_nhc(fi, ret); if (nhc_l3mdev_matches_dev(nhc, dev)) { dev_match = true; break; } } } #else if (fib_info_nhc(fi, 0)->nhc_dev == dev) dev_match = true; #endif return dev_match; } EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev); /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. * - figure out what "logical" interface this packet arrived * and calculate "specific destination" address. * - check, that packet arrived from expected physical interface. * called with rcu_read_lock() */ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, int rpf, struct in_device *idev, u32 *itag) { struct net *net = dev_net(dev); enum skb_drop_reason reason; struct flow_keys flkeys; int ret, no_addr; struct fib_result res; struct flowi4 fl4; bool dev_match; fl4.flowi4_oif = 0; fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev); fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_multipath_hash = 0; no_addr = idev->ifa_list == NULL; fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) { fl4.flowi4_proto = 0; fl4.fl4_sport = 0; fl4.fl4_dport = 0; } else { swap(fl4.fl4_sport, fl4.fl4_dport); } if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST) { if (res.type != RTN_LOCAL) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto e_inval; } else if (!IN_DEV_ACCEPT_LOCAL(idev)) { reason = SKB_DROP_REASON_IP_LOCAL_SOURCE; goto e_inval; } } fib_combine_itag(itag, &res); dev_match = fib_info_nh_uses_dev(res.fi, dev); /* This is not common, loopback packets retain skb_dst so normally they * would not even hit this slow path. */ dev_match = dev_match || (res.type == RTN_LOCAL && dev == net->loopback_dev); if (dev_match) { ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; return ret; } if (no_addr) goto last_resort; if (rpf == 1) goto e_rpf; fl4.flowi4_oif = dev->ifindex; ret = 0; if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; } return ret; last_resort: if (rpf) goto e_rpf; *itag = 0; return 0; e_inval: return -reason; e_rpf: return -SKB_DROP_REASON_IP_RPFILTER; } /* Ignore rp_filter for packets protected by IPsec. */ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); struct net *net = dev_net(dev); if (!r && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { if (IN_DEV_ACCEPT_LOCAL(idev)) goto ok; /* with custom local routes in place, checking local addresses * only will be too optimistic, with custom rules, checking * local addresses only can be too strict, e.g. due to vrf */ if (net->ipv4.fib_has_custom_local_routes || fib4_has_custom_rules(net)) goto full_check; /* Within the same container, it is regarded as a martian source, * and the same host but different containers are not. */ if (inet_lookup_ifaddr_rcu(net, src)) return -SKB_DROP_REASON_IP_LOCAL_SOURCE; ok: *itag = 0; return 0; } full_check: return __fib_validate_source(skb, src, dst, dscp, oif, dev, r, idev, itag); } static inline __be32 sk_extract_addr(struct sockaddr *addr) { return ((struct sockaddr_in *) addr)->sin_addr.s_addr; } static int put_rtax(struct nlattr *mx, int len, int type, u32 value) { struct nlattr *nla; nla = (struct nlattr *) ((char *) mx + len); nla->nla_type = type; nla->nla_len = nla_attr_size(4); *(u32 *) nla_data(nla) = value; return len + nla_total_size(4); } static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, struct fib_config *cfg) { __be32 addr; int plen; memset(cfg, 0, sizeof(*cfg)); cfg->fc_nlinfo.nl_net = net; if (rt->rt_dst.sa_family != AF_INET) return -EAFNOSUPPORT; /* * Check mask for validity: * a) it must be contiguous. * b) destination must have all host bits clear. * c) if application forgot to set correct family (AF_INET), * reject request unless it is absolutely clear i.e. * both family and mask are zero. */ plen = 32; addr = sk_extract_addr(&rt->rt_dst); if (!(rt->rt_flags & RTF_HOST)) { __be32 mask = sk_extract_addr(&rt->rt_genmask); if (rt->rt_genmask.sa_family != AF_INET) { if (mask || rt->rt_genmask.sa_family) return -EAFNOSUPPORT; } if (bad_mask(mask, addr)) return -EINVAL; plen = inet_mask_len(mask); } cfg->fc_dst_len = plen; cfg->fc_dst = addr; if (cmd != SIOCDELRT) { cfg->fc_nlflags = NLM_F_CREATE; cfg->fc_protocol = RTPROT_BOOT; } if (rt->rt_metric) cfg->fc_priority = rt->rt_metric - 1; if (rt->rt_flags & RTF_REJECT) { cfg->fc_scope = RT_SCOPE_HOST; cfg->fc_type = RTN_UNREACHABLE; return 0; } cfg->fc_scope = RT_SCOPE_NOWHERE; cfg->fc_type = RTN_UNICAST; if (rt->rt_dev) { char *colon; struct net_device *dev; char devname[IFNAMSIZ]; if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1)) return -EFAULT; devname[IFNAMSIZ-1] = 0; colon = strchr(devname, ':'); if (colon) *colon = 0; dev = __dev_get_by_name(net, devname); if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; cfg->fc_table = l3mdev_fib_table(dev); if (colon) { const struct in_ifaddr *ifa; struct in_device *in_dev; in_dev = __in_dev_get_rtnl_net(dev); if (!in_dev) return -ENODEV; *colon = ':'; in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) { if (strcmp(ifa->ifa_label, devname) == 0) break; } if (!ifa) return -ENODEV; cfg->fc_prefsrc = ifa->ifa_local; } } addr = sk_extract_addr(&rt->rt_gateway); if (rt->rt_gateway.sa_family == AF_INET && addr) { unsigned int addr_type; cfg->fc_gw4 = addr; cfg->fc_gw_family = AF_INET; addr_type = inet_addr_type_table(net, addr, cfg->fc_table); if (rt->rt_flags & RTF_GATEWAY && addr_type == RTN_UNICAST) cfg->fc_scope = RT_SCOPE_UNIVERSE; } if (!cfg->fc_table) cfg->fc_table = RT_TABLE_MAIN; if (cmd == SIOCDELRT) return 0; if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family) return -EINVAL; if (cfg->fc_scope == RT_SCOPE_NOWHERE) cfg->fc_scope = RT_SCOPE_LINK; if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) { struct nlattr *mx; int len = 0; mx = kcalloc(3, nla_total_size(4), GFP_KERNEL); if (!mx) return -ENOMEM; if (rt->rt_flags & RTF_MTU) len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40); if (rt->rt_flags & RTF_WINDOW) len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window); if (rt->rt_flags & RTF_IRTT) len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3); cfg->fc_mx = mx; cfg->fc_mx_len = len; } return 0; } /* * Handle IP routing ioctl calls. * These are used to manipulate the routing tables */ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) { struct fib_config cfg; int err; switch (cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtnl_net_lock(net); err = rtentry_to_fib_config(net, cmd, rt, &cfg); if (err == 0) { struct fib_table *tb; if (cmd == SIOCDELRT) { tb = fib_get_table(net, cfg.fc_table); if (tb) err = fib_table_delete(net, tb, &cfg, NULL); else err = -ESRCH; } else { tb = fib_new_table(net, cfg.fc_table); if (tb) err = fib_table_insert(net, tb, &cfg, NULL); else err = -ENOBUFS; } /* allocated by rtentry_to_fib_config() */ kfree(cfg.fc_mx); } rtnl_net_unlock(net); return err; } return -EINVAL; } const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_DST] = { .type = NLA_U32 }, [RTA_SRC] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, [RTA_OIF] = { .type = NLA_U32 }, [RTA_GATEWAY] = { .type = NLA_U32 }, [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_PREFSRC] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_FLOW] = { .type = NLA_U32 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_UID] = { .type = NLA_U32 }, [RTA_MARK] = { .type = NLA_U32 }, [RTA_TABLE] = { .type = NLA_U32 }, [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, [RTA_NH_ID] = { .type = NLA_U32 }, }; int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, struct netlink_ext_ack *extack) { struct rtvia *via; int alen; if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA"); return -EINVAL; } via = nla_data(nla); alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); switch (via->rtvia_family) { case AF_INET: if (alen != sizeof(__be32)) { NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA"); return -EINVAL; } cfg->fc_gw_family = AF_INET; cfg->fc_gw4 = *((__be32 *)via->rtvia_addr); break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) if (alen != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA"); return -EINVAL; } cfg->fc_gw_family = AF_INET6; cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr); #else NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); return -EINVAL; #endif break; default: NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA"); return -EINVAL; } return 0; } static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_config *cfg, struct netlink_ext_ack *extack) { bool has_gw = false, has_via = false; struct nlattr *attr; int err, remaining; struct rtmsg *rtm; err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy, extack); if (err < 0) goto errout; memset(cfg, 0, sizeof(*cfg)); rtm = nlmsg_data(nlh); if (!inet_validate_dscp(rtm->rtm_tos)) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): ECN bits must be 0"); err = -EINVAL; goto errout; } cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos); cfg->fc_dst_len = rtm->rtm_dst_len; cfg->fc_table = rtm->rtm_table; cfg->fc_protocol = rtm->rtm_protocol; cfg->fc_scope = rtm->rtm_scope; cfg->fc_type = rtm->rtm_type; cfg->fc_flags = rtm->rtm_flags; cfg->fc_nlflags = nlh->nlmsg_flags; cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = net; if (cfg->fc_type > RTN_MAX) { NL_SET_ERR_MSG(extack, "Invalid route type"); err = -EINVAL; goto errout; } nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) { switch (nla_type(attr)) { case RTA_DST: cfg->fc_dst = nla_get_be32(attr); break; case RTA_OIF: cfg->fc_oif = nla_get_u32(attr); break; case RTA_GATEWAY: has_gw = true; cfg->fc_gw4 = nla_get_be32(attr); if (cfg->fc_gw4) cfg->fc_gw_family = AF_INET; break; case RTA_VIA: has_via = true; err = fib_gw_from_via(cfg, attr, extack); if (err) goto errout; break; case RTA_PRIORITY: cfg->fc_priority = nla_get_u32(attr); break; case RTA_PREFSRC: cfg->fc_prefsrc = nla_get_be32(attr); break; case RTA_METRICS: cfg->fc_mx = nla_data(attr); cfg->fc_mx_len = nla_len(attr); break; case RTA_MULTIPATH: err = lwtunnel_valid_encap_type_attr(nla_data(attr), nla_len(attr), extack, false); if (err < 0) goto errout; cfg->fc_mp = nla_data(attr); cfg->fc_mp_len = nla_len(attr); break; case RTA_FLOW: cfg->fc_flow = nla_get_u32(attr); break; case RTA_TABLE: cfg->fc_table = nla_get_u32(attr); break; case RTA_ENCAP: cfg->fc_encap = attr; break; case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack, false); if (err < 0) goto errout; break; case RTA_NH_ID: cfg->fc_nh_id = nla_get_u32(attr); break; } } if (cfg->fc_dst_len > 32) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); err = -EINVAL; goto errout; } if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) { NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length"); err = -EINVAL; goto errout; } if (cfg->fc_nh_id) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_encap || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); err = -EINVAL; goto errout; } } if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); err = -EINVAL; goto errout; } if (!cfg->fc_table) cfg->fc_table = RT_TABLE_MAIN; return 0; errout: return err; } static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; rtnl_net_lock(net); if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); err = -EINVAL; goto unlock; } tb = fib_get_table(net, cfg.fc_table); if (!tb) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); err = -ESRCH; goto unlock; } err = fib_table_delete(net, tb, &cfg, extack); unlock: rtnl_net_unlock(net); errout: return err; } static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; rtnl_net_lock(net); tb = fib_new_table(net, cfg.fc_table); if (!tb) { err = -ENOBUFS; goto unlock; } err = fib_table_insert(net, tb, &cfg, extack); if (!err && cfg.fc_type == RTN_LOCAL) net->ipv4.fib_has_custom_local_routes = true; unlock: rtnl_net_unlock(net); errout: return err; } int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[RTA_MAX + 1]; struct rtmsg *rtm; int err, i; if (filter->rtnl_held) ASSERT_RTNL(); if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request"); return -EINVAL; } rtm = nlmsg_data(nlh); if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_scope) { NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); return -EINVAL; } if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) { NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request"); return -EINVAL; } if (rtm->rtm_flags & RTM_F_CLONED) filter->dump_routes = false; else filter->dump_exceptions = false; filter->flags = rtm->rtm_flags; filter->protocol = rtm->rtm_protocol; filter->rt_type = rtm->rtm_type; filter->table_id = rtm->rtm_table; err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if (err < 0) return err; for (i = 0; i <= RTA_MAX; ++i) { int ifindex; if (!tb[i]) continue; switch (i) { case RTA_TABLE: filter->table_id = nla_get_u32(tb[i]); break; case RTA_OIF: ifindex = nla_get_u32(tb[i]); if (filter->rtnl_held) filter->dev = __dev_get_by_index(net, ifindex); else filter->dev = dev_get_by_index_rcu(net, ifindex); if (!filter->dev) return -ENODEV; break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; } } if (filter->flags || filter->protocol || filter->rt_type || filter->table_id || filter->dev) { filter->filter_set = 1; cb->answer_flags = NLM_F_DUMP_FILTERED; } return 0; } EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req); static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct fib_dump_filter filter = { .dump_routes = true, .dump_exceptions = true, .rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; struct hlist_head *head; int dumped = 0, err = 0; rcu_read_lock(); if (cb->strict_check) { err = ip_valid_fib_dump_req(net, nlh, &filter, cb); if (err < 0) goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED); } /* ipv4 does not use prefix flag */ if (filter.flags & RTM_F_PREFIX) goto unlock; if (filter.table_id) { tb = fib_get_table(net, filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET) goto unlock; NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist"); err = -ENOENT; goto unlock; } err = fib_table_dump(tb, skb, cb, &filter); goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; err = 0; for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist) { if (e < s_e) goto next; if (dumped) memset(&cb->args[2], 0, sizeof(cb->args) - 2 * sizeof(cb->args[0])); err = fib_table_dump(tb, skb, cb, &filter); if (err < 0) goto out; dumped = 1; next: e++; } } out: cb->args[1] = e; cb->args[0] = h; unlock: rcu_read_unlock(); return err; } /* Prepare and feed intra-kernel routing request. * Really, it should be netlink message, but :-( netlink * can be not configured, so that we feed it directly * to fib engine. It is legal, because all events occur * only when netlink is already locked. */ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa, u32 rt_priority) { struct net *net = dev_net(ifa->ifa_dev->dev); u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, .fc_type = type, .fc_dst = dst, .fc_dst_len = dst_len, .fc_priority = rt_priority, .fc_prefsrc = ifa->ifa_local, .fc_oif = ifa->ifa_dev->dev->ifindex, .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND, .fc_nlinfo = { .nl_net = net, }, }; if (!tb_id) tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL; tb = fib_new_table(net, tb_id); if (!tb) return; cfg.fc_table = tb->tb_id; if (type != RTN_LOCAL) cfg.fc_scope = RT_SCOPE_LINK; else cfg.fc_scope = RT_SCOPE_HOST; if (cmd == RTM_NEWROUTE) fib_table_insert(net, tb, &cfg, NULL); else fib_table_delete(net, tb, &cfg, NULL); } void fib_add_ifaddr(struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *prim = ifa; __be32 mask = ifa->ifa_mask; __be32 addr = ifa->ifa_local; __be32 prefix = ifa->ifa_address & mask; if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, prefix, mask); if (!prim) { pr_warn("%s: bug: prim == NULL\n", __func__); return; } } fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0); if (!(dev->flags & IFF_UP)) return; /* Add broadcast address, if it is explicitly assigned. */ if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); arp_invalidate(dev, ifa->ifa_broadcast, false); } if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_NEWROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim, ifa->ifa_rt_priority); /* Add the network broadcast address, when it makes sense */ if (ifa->ifa_prefixlen < 31) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 32, prim, 0); arp_invalidate(dev, prefix | ~mask, false); } } } void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric) { __be32 prefix = ifa->ifa_address & ifa->ifa_mask; struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; if (!(dev->flags & IFF_UP) || ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) || ipv4_is_zeronet(prefix) || (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32)) return; /* add the new */ fib_magic(RTM_NEWROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, ifa, new_metric); /* delete the old */ fib_magic(RTM_DELROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority); } /* Delete primary or secondary address. * Optionally, on secondary address promotion consider the addresses * from subnet iprim as deleted, even if they are in device list. * In this case the secondary ifa can be in device list. */ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *ifa1; struct in_ifaddr *prim = ifa, *prim1 = NULL; __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; __be32 any = ifa->ifa_address & ifa->ifa_mask; #define LOCAL_OK 1 #define BRD_OK 2 #define BRD0_OK 4 #define BRD1_OK 8 unsigned int ok = 0; int subnet = 0; /* Primary network */ int gone = 1; /* Address is missing */ int same_prefsrc = 0; /* Another primary with same IP */ if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (!prim) { /* if the device has been deleted, we don't perform * address promotion */ if (!in_dev->dead) pr_warn("%s: bug: prim == NULL\n", __func__); return; } if (iprim && iprim != prim) { pr_warn("%s: bug: iprim != prim\n", __func__); return; } } else if (!ipv4_is_zeronet(any) && (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_DELROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, any, ifa->ifa_prefixlen, prim, 0); subnet = 1; } if (in_dev->dead) goto no_promotions; /* Deletion is more complicated than add. * We should take care of not to delete too much :-) * * Scan address list to be sure that addresses are really gone. */ rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa1, in_dev) { if (ifa1 == ifa) { /* promotion, keep the IP */ gone = 0; continue; } /* Ignore IFAs from our subnet */ if (iprim && ifa1->ifa_mask == iprim->ifa_mask && inet_ifa_match(ifa1->ifa_address, iprim)) continue; /* Ignore ifa1 if it uses different primary IP (prefsrc) */ if (ifa1->ifa_flags & IFA_F_SECONDARY) { /* Another address from our subnet? */ if (ifa1->ifa_mask == prim->ifa_mask && inet_ifa_match(ifa1->ifa_address, prim)) prim1 = prim; else { /* We reached the secondaries, so * same_prefsrc should be determined. */ if (!same_prefsrc) continue; /* Search new prim1 if ifa1 is not * using the current prim1 */ if (!prim1 || ifa1->ifa_mask != prim1->ifa_mask || !inet_ifa_match(ifa1->ifa_address, prim1)) prim1 = inet_ifa_byprefix(in_dev, ifa1->ifa_address, ifa1->ifa_mask); if (!prim1) continue; if (prim1->ifa_local != prim->ifa_local) continue; } } else { if (prim->ifa_local != ifa1->ifa_local) continue; prim1 = ifa1; if (prim != prim1) same_prefsrc = 1; } if (ifa->ifa_local == ifa1->ifa_local) ok |= LOCAL_OK; if (ifa->ifa_broadcast == ifa1->ifa_broadcast) ok |= BRD_OK; if (brd == ifa1->ifa_broadcast) ok |= BRD1_OK; if (any == ifa1->ifa_broadcast) ok |= BRD0_OK; /* primary has network specific broadcasts */ if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) { __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask; __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask; if (!ipv4_is_zeronet(any1)) { if (ifa->ifa_broadcast == brd1 || ifa->ifa_broadcast == any1) ok |= BRD_OK; if (brd == brd1 || brd == any1) ok |= BRD1_OK; if (any == brd1 || any == any1) ok |= BRD0_OK; } } } rcu_read_unlock(); no_promotions: if (!(ok & BRD_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); if (subnet && ifa->ifa_prefixlen < 31) { if (!(ok & BRD1_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim, 0); if (!(ok & BRD0_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim, 0); } if (!(ok & LOCAL_OK)) { unsigned int addr_type; fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0); /* Check, that this local address finally disappeared. */ addr_type = inet_addr_type_dev_table(dev_net(dev), dev, ifa->ifa_local); if (gone && addr_type != RTN_LOCAL) { /* And the last, but not the least thing. * We must flush stray FIB entries. * * First of all, we scan fib_info list searching * for stray nexthop entries, then ignite fib_flush. */ if (fib_sync_down_addr(dev, ifa->ifa_local)) fib_flush(dev_net(dev)); } } #undef LOCAL_OK #undef BRD_OK #undef BRD0_OK #undef BRD1_OK } static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) { struct fib_result res; struct flowi4 fl4 = { .flowi4_mark = frn->fl_mark, .daddr = frn->fl_addr, .flowi4_tos = frn->fl_tos & INET_DSCP_MASK, .flowi4_scope = frn->fl_scope, }; struct fib_table *tb; rcu_read_lock(); tb = fib_get_table(net, frn->tb_id_in); frn->err = -ENOENT; if (tb) { local_bh_disable(); frn->tb_id = tb->tb_id; frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); if (!frn->err) { frn->prefixlen = res.prefixlen; frn->nh_sel = res.nh_sel; frn->type = res.type; frn->scope = res.scope; } local_bh_enable(); } rcu_read_unlock(); } static void nl_fib_input(struct sk_buff *skb) { struct net *net; struct fib_result_nl *frn; struct nlmsghdr *nlh; u32 portid; net = sock_net(skb->sk); nlh = nlmsg_hdr(skb); if (skb->len < nlmsg_total_size(sizeof(*frn)) || skb->len < nlh->nlmsg_len || nlmsg_len(nlh) < sizeof(*frn)) return; skb = netlink_skb_clone(skb, GFP_KERNEL); if (!skb) return; nlh = nlmsg_hdr(skb); frn = nlmsg_data(nlh); nl_fib_lookup(net, frn); portid = NETLINK_CB(skb).portid; /* netlink portid */ NETLINK_CB(skb).portid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ nlmsg_unicast(net->ipv4.fibnl, skb, portid); } static int __net_init nl_fib_lookup_init(struct net *net) { struct sock *sk; struct netlink_kernel_cfg cfg = { .input = nl_fib_input, }; sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg); if (!sk) return -EAFNOSUPPORT; net->ipv4.fibnl = sk; return 0; } static void nl_fib_lookup_exit(struct net *net) { netlink_kernel_release(net->ipv4.fibnl); net->ipv4.fibnl = NULL; } static void fib_disable_ip(struct net_device *dev, unsigned long event, bool force) { if (fib_sync_down_dev(dev, event, force)) fib_flush(dev_net(dev)); else rt_cache_flush(dev_net(dev)); arp_ifdown(dev); } static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = ptr; struct net_device *dev = ifa->ifa_dev->dev; struct net *net = dev_net(dev); switch (event) { case NETDEV_UP: fib_add_ifaddr(ifa); #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); atomic_inc(&net->ipv4.dev_addr_genid); if (!ifa->ifa_dev->ifa_list) { /* Last address was deleted from this interface. * Disable IP. */ fib_disable_ip(dev, event, true); } else { rt_cache_flush(net); } break; } return NOTIFY_DONE; } static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_changeupper_info *upper_info = ptr; struct netdev_notifier_info_ext *info_ext = ptr; struct in_device *in_dev; struct net *net = dev_net(dev); struct in_ifaddr *ifa; unsigned int flags; if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, event, true); rt_flush_dev(dev); return NOTIFY_DONE; } in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return NOTIFY_DONE; switch (event) { case NETDEV_UP: in_dev_for_each_ifa_rtnl(ifa, in_dev) { fib_add_ifaddr(ifa); } #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: fib_disable_ip(dev, event, false); break; case NETDEV_CHANGE: flags = dev_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) fib_sync_up(dev, RTNH_F_LINKDOWN); else fib_sync_down_dev(dev, event, false); rt_cache_flush(net); break; case NETDEV_CHANGEMTU: fib_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(net); break; case NETDEV_CHANGEUPPER: upper_info = ptr; /* flush all routes if dev is linked to or unlinked from * an L3 master device (e.g., VRF) */ if (upper_info->upper_dev && netif_is_l3_master(upper_info->upper_dev)) fib_disable_ip(dev, NETDEV_DOWN, true); break; } return NOTIFY_DONE; } static struct notifier_block fib_inetaddr_notifier = { .notifier_call = fib_inetaddr_event, }; static struct notifier_block fib_netdev_notifier = { .notifier_call = fib_netdev_event, }; static int __net_init ip_fib_net_init(struct net *net) { int err; size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; err = fib4_notifier_init(net); if (err) return err; #ifdef CONFIG_IP_ROUTE_MULTIPATH /* Default to 3-tuple */ net->ipv4.sysctl_fib_multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; #endif /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv4.fib_table_hash) { err = -ENOMEM; goto err_table_hash_alloc; } err = fib4_rules_init(net); if (err < 0) goto err_rules_init; return 0; err_rules_init: kfree(net->ipv4.fib_table_hash); err_table_hash_alloc: fib4_notifier_exit(net); return err; } static void ip_fib_net_exit(struct net *net) { int i; ASSERT_RTNL_NET(net); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); #endif /* Destroy the tables in reverse order to guarantee that the * local table, ID 255, is destroyed before the main table, ID * 254. This is necessary as the local table may contain * references to data contained in the main table. */ for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) { struct hlist_head *head = &net->ipv4.fib_table_hash[i]; struct hlist_node *tmp; struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { hlist_del(&tb->tb_hlist); fib_table_flush(net, tb, true); fib_free_table(tb); } } #ifdef CONFIG_IP_MULTIPLE_TABLES fib4_rules_exit(net); #endif kfree(net->ipv4.fib_table_hash); fib4_notifier_exit(net); } static int __net_init fib_net_init(struct net *net) { int error; #ifdef CONFIG_IP_ROUTE_CLASSID atomic_set(&net->ipv4.fib_num_tclassid_users, 0); #endif error = ip_fib_net_init(net); if (error < 0) goto out; error = fib4_semantics_init(net); if (error) goto out_semantics; error = nl_fib_lookup_init(net); if (error < 0) goto out_nlfl; error = fib_proc_init(net); if (error < 0) goto out_proc; out: return error; out_proc: nl_fib_lookup_exit(net); out_nlfl: fib4_semantics_exit(net); out_semantics: rtnl_net_lock(net); ip_fib_net_exit(net); rtnl_net_unlock(net); goto out; } static void __net_exit fib_net_exit(struct net *net) { fib_proc_exit(net); nl_fib_lookup_exit(net); } static void __net_exit fib_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { __rtnl_net_lock(net); ip_fib_net_exit(net); __rtnl_net_unlock(net); } rtnl_unlock(); list_for_each_entry(net, net_list, exit_list) fib4_semantics_exit(net); } static struct pernet_operations fib_net_ops = { .init = fib_net_init, .exit = fib_net_exit, .exit_batch = fib_net_exit_batch, }; static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_NEWROUTE, .doit = inet_rtm_newroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_DELROUTE, .doit = inet_rtm_delroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, }; void __init ip_fib_init(void) { fib_trie_init(); register_pernet_subsys(&fib_net_ops); register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); rtnl_register_many(fib_rtnl_msg_handlers); } |
| 417 22 56 34 34 55 55 56 56 56 50 8 1 3 2 45 4 1 49 23 3 13 33 28 1 3 1 9 9 2 1 25 25 25 25 24 9 21 21 21 21 21 21 21 6 6 6 8 8 16 16 3 20 20 2 12 1 11 33 27 25 27 4 1 4 27 13 20 19 10 13 13 13 19 7 18 1 1 1 17 1 17 5 13 11 12 1 15 1 15 5 16 16 13 9 4 2 8 7 3 9 55 43 15 55 12 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 | /* inflate.c -- zlib decompression * Copyright (C) 1995-2005 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * * Based on zlib 1.2.3 but modified for the Linux Kernel by * Richard Purdie <richard@openedhand.com> * * Changes mainly for static instead of dynamic memory allocation * */ #include <linux/zutil.h> #include "inftrees.h" #include "inflate.h" #include "inffast.h" #include "infutil.h" /* architecture-specific bits */ #ifdef CONFIG_ZLIB_DFLTCC # include "../zlib_dfltcc/dfltcc_inflate.h" #else #define INFLATE_RESET_HOOK(strm) do {} while (0) #define INFLATE_TYPEDO_HOOK(strm, flush) do {} while (0) #define INFLATE_NEED_UPDATEWINDOW(strm) 1 #define INFLATE_NEED_CHECKSUM(strm) 1 #endif int zlib_inflate_workspacesize(void) { return sizeof(struct inflate_workspace); } int zlib_inflateReset(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; strm->total_in = strm->total_out = state->total = 0; strm->msg = NULL; strm->adler = 1; /* to support ill-conceived Java test suite */ state->mode = HEAD; state->last = 0; state->havedict = 0; state->dmax = 32768U; state->hold = 0; state->bits = 0; state->lencode = state->distcode = state->next = state->codes; /* Initialise Window */ state->wsize = 1U << state->wbits; state->write = 0; state->whave = 0; INFLATE_RESET_HOOK(strm); return Z_OK; } int zlib_inflateInit2(z_streamp strm, int windowBits) { struct inflate_state *state; if (strm == NULL) return Z_STREAM_ERROR; strm->msg = NULL; /* in case we return an error */ state = &WS(strm)->inflate_state; strm->state = (struct internal_state *)state; if (windowBits < 0) { state->wrap = 0; windowBits = -windowBits; } else { state->wrap = (windowBits >> 4) + 1; } if (windowBits < 8 || windowBits > 15) { return Z_STREAM_ERROR; } state->wbits = (unsigned)windowBits; #ifdef CONFIG_ZLIB_DFLTCC /* * DFLTCC requires the window to be page aligned. * Thus, we overallocate and take the aligned portion of the buffer. */ state->window = PTR_ALIGN(&WS(strm)->working_window[0], PAGE_SIZE); #else state->window = &WS(strm)->working_window[0]; #endif return zlib_inflateReset(strm); } /* Return state with length and distance decoding tables and index sizes set to fixed code decoding. This returns fixed tables from inffixed.h. */ static void zlib_fixedtables(struct inflate_state *state) { # include "inffixed.h" state->lencode = lenfix; state->lenbits = 9; state->distcode = distfix; state->distbits = 5; } /* Update the window with the last wsize (normally 32K) bytes written before returning. This is only called when a window is already in use, or when output has been written during this inflate call, but the end of the deflate stream has not been reached yet. It is also called to window dictionary data when a dictionary is loaded. Providing output buffers larger than 32K to inflate() should provide a speed advantage, since only the last 32K of output is copied to the sliding window upon return from inflate(), and since all distances after the first 32K of output will fall in the output data, making match copies simpler and faster. The advantage may be dependent on the size of the processor's data caches. */ static void zlib_updatewindow(z_streamp strm, unsigned out) { struct inflate_state *state; unsigned copy, dist; state = (struct inflate_state *)strm->state; /* copy state->wsize or less output bytes into the circular window */ copy = out - strm->avail_out; if (copy >= state->wsize) { memcpy(state->window, strm->next_out - state->wsize, state->wsize); state->write = 0; state->whave = state->wsize; } else { dist = state->wsize - state->write; if (dist > copy) dist = copy; memcpy(state->window + state->write, strm->next_out - copy, dist); copy -= dist; if (copy) { memcpy(state->window, strm->next_out - copy, copy); state->write = copy; state->whave = state->wsize; } else { state->write += dist; if (state->write == state->wsize) state->write = 0; if (state->whave < state->wsize) state->whave += dist; } } } /* * At the end of a Deflate-compressed PPP packet, we expect to have seen * a `stored' block type value but not the (zero) length bytes. */ /* Returns true if inflate is currently at the end of a block generated by Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP implementation to provide an additional safety check. PPP uses Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored block. When decompressing, PPP checks that at the end of input packet, inflate is waiting for these length bytes. */ static int zlib_inflateSyncPacket(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == STORED && state->bits == 0) { state->mode = TYPE; return Z_OK; } return Z_DATA_ERROR; } /* Macros for inflate(): */ /* check function to use adler32() for zlib or crc32() for gzip */ #define UPDATE(check, buf, len) zlib_adler32(check, buf, len) /* Load registers with state in inflate() for speed */ #define LOAD() \ do { \ put = strm->next_out; \ left = strm->avail_out; \ next = strm->next_in; \ have = strm->avail_in; \ hold = state->hold; \ bits = state->bits; \ } while (0) /* Restore state from registers in inflate() */ #define RESTORE() \ do { \ strm->next_out = put; \ strm->avail_out = left; \ strm->next_in = next; \ strm->avail_in = have; \ state->hold = hold; \ state->bits = bits; \ } while (0) /* Clear the input bit accumulator */ #define INITBITS() \ do { \ hold = 0; \ bits = 0; \ } while (0) /* Get a byte of input into the bit accumulator, or return from inflate() if there is no input available. */ #define PULLBYTE() \ do { \ if (have == 0) goto inf_leave; \ have--; \ hold += (unsigned long)(*next++) << bits; \ bits += 8; \ } while (0) /* Assure that there are at least n bits in the bit accumulator. If there is not enough available input to do that, then return from inflate(). */ #define NEEDBITS(n) \ do { \ while (bits < (unsigned)(n)) \ PULLBYTE(); \ } while (0) /* Return the low n bits of the bit accumulator (n < 16) */ #define BITS(n) \ ((unsigned)hold & ((1U << (n)) - 1)) /* Remove n bits from the bit accumulator */ #define DROPBITS(n) \ do { \ hold >>= (n); \ bits -= (unsigned)(n); \ } while (0) /* Remove zero to seven bits as needed to go to a byte boundary */ #define BYTEBITS() \ do { \ hold >>= bits & 7; \ bits -= bits & 7; \ } while (0) /* inflate() uses a state machine to process as much input data and generate as much output data as possible before returning. The state machine is structured roughly as follows: for (;;) switch (state) { ... case STATEn: if (not enough input data or output space to make progress) return; ... make progress ... state = STATEm; break; ... } so when inflate() is called again, the same case is attempted again, and if the appropriate resources are provided, the machine proceeds to the next state. The NEEDBITS() macro is usually the way the state evaluates whether it can proceed or should return. NEEDBITS() does the return if the requested bits are not available. The typical use of the BITS macros is: NEEDBITS(n); ... do something with BITS(n) ... DROPBITS(n); where NEEDBITS(n) either returns from inflate() if there isn't enough input left to load n bits into the accumulator, or it continues. BITS(n) gives the low n bits in the accumulator. When done, DROPBITS(n) drops the low n bits off the accumulator. INITBITS() clears the accumulator and sets the number of available bits to zero. BYTEBITS() discards just enough bits to put the accumulator on a byte boundary. After BYTEBITS() and a NEEDBITS(8), then BITS(8) would return the next byte in the stream. NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return if there is no input available. The decoding of variable length codes uses PULLBYTE() directly in order to pull just enough bytes to decode the next code, and no more. Some states loop until they get enough input, making sure that enough state information is maintained to continue the loop where it left off if NEEDBITS() returns in the loop. For example, want, need, and keep would all have to actually be part of the saved state in case NEEDBITS() returns: case STATEw: while (want < need) { NEEDBITS(n); keep[want++] = BITS(n); DROPBITS(n); } state = STATEx; case STATEx: As shown above, if the next state is also the next case, then the break is omitted. A state may also return if there is not enough output space available to complete that state. Those states are copying stored data, writing a literal byte, and copying a matching string. When returning, a "goto inf_leave" is used to update the total counters, update the check value, and determine whether any progress has been made during that inflate() call in order to return the proper return code. Progress is defined as a change in either strm->avail_in or strm->avail_out. When there is a window, goto inf_leave will update the window with the last output written. If a goto inf_leave occurs in the middle of decompression and there is no window currently, goto inf_leave will create one and copy output to the window for the next call of inflate(). In this implementation, the flush parameter of inflate() only affects the return code (per zlib.h). inflate() always writes as much as possible to strm->next_out, given the space available and the provided input--the effect documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers the allocation of and copying into a sliding window until necessary, which provides the effect documented in zlib.h for Z_FINISH when the entire input stream available. So the only thing the flush parameter actually does is: when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it will return Z_BUF_ERROR if it has not reached the end of the stream. */ int zlib_inflate(z_streamp strm, int flush) { struct inflate_state *state; const unsigned char *next; /* next input */ unsigned char *put; /* next output */ unsigned have, left; /* available input and output */ unsigned long hold; /* bit buffer */ unsigned bits; /* bits in bit buffer */ unsigned in, out; /* save starting available input and output */ unsigned copy; /* number of stored or match bytes to copy */ unsigned char *from; /* where to copy match bytes from */ code this; /* current decoding table entry */ code last; /* parent table entry */ unsigned len; /* length to copy for repeats, bits to drop */ int ret; /* return code */ static const unsigned short order[19] = /* permutation of code lengths */ {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; /* Do not check for strm->next_out == NULL here as ppc zImage inflates to strm->next_out = 0 */ if (strm == NULL || strm->state == NULL || (strm->next_in == NULL && strm->avail_in != 0)) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */ LOAD(); in = have; out = left; ret = Z_OK; for (;;) switch (state->mode) { case HEAD: if (state->wrap == 0) { state->mode = TYPEDO; break; } NEEDBITS(16); if ( ((BITS(8) << 8) + (hold >> 8)) % 31) { strm->msg = (char *)"incorrect header check"; state->mode = BAD; break; } if (BITS(4) != Z_DEFLATED) { strm->msg = (char *)"unknown compression method"; state->mode = BAD; break; } DROPBITS(4); len = BITS(4) + 8; if (len > state->wbits) { strm->msg = (char *)"invalid window size"; state->mode = BAD; break; } state->dmax = 1U << len; strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = hold & 0x200 ? DICTID : TYPE; INITBITS(); break; case DICTID: NEEDBITS(32); strm->adler = state->check = REVERSE(hold); INITBITS(); state->mode = DICT; fallthrough; case DICT: if (state->havedict == 0) { RESTORE(); return Z_NEED_DICT; } strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = TYPE; fallthrough; case TYPE: if (flush == Z_BLOCK) goto inf_leave; fallthrough; case TYPEDO: INFLATE_TYPEDO_HOOK(strm, flush); if (state->last) { BYTEBITS(); state->mode = CHECK; break; } NEEDBITS(3); state->last = BITS(1); DROPBITS(1); switch (BITS(2)) { case 0: /* stored block */ state->mode = STORED; break; case 1: /* fixed block */ zlib_fixedtables(state); state->mode = LEN; /* decode codes */ break; case 2: /* dynamic block */ state->mode = TABLE; break; case 3: strm->msg = (char *)"invalid block type"; state->mode = BAD; } DROPBITS(2); break; case STORED: BYTEBITS(); /* go to byte boundary */ NEEDBITS(32); if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) { strm->msg = (char *)"invalid stored block lengths"; state->mode = BAD; break; } state->length = (unsigned)hold & 0xffff; INITBITS(); state->mode = COPY; fallthrough; case COPY: copy = state->length; if (copy) { if (copy > have) copy = have; if (copy > left) copy = left; if (copy == 0) goto inf_leave; memcpy(put, next, copy); have -= copy; next += copy; left -= copy; put += copy; state->length -= copy; break; } state->mode = TYPE; break; case TABLE: NEEDBITS(14); state->nlen = BITS(5) + 257; DROPBITS(5); state->ndist = BITS(5) + 1; DROPBITS(5); state->ncode = BITS(4) + 4; DROPBITS(4); #ifndef PKZIP_BUG_WORKAROUND if (state->nlen > 286 || state->ndist > 30) { strm->msg = (char *)"too many length or distance symbols"; state->mode = BAD; break; } #endif state->have = 0; state->mode = LENLENS; fallthrough; case LENLENS: while (state->have < state->ncode) { NEEDBITS(3); state->lens[order[state->have++]] = (unsigned short)BITS(3); DROPBITS(3); } while (state->have < 19) state->lens[order[state->have++]] = 0; state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 7; ret = zlib_inflate_table(CODES, state->lens, 19, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid code lengths set"; state->mode = BAD; break; } state->have = 0; state->mode = CODELENS; fallthrough; case CODELENS: while (state->have < state->nlen + state->ndist) { for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.val < 16) { NEEDBITS(this.bits); DROPBITS(this.bits); state->lens[state->have++] = this.val; } else { if (this.val == 16) { NEEDBITS(this.bits + 2); DROPBITS(this.bits); if (state->have == 0) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } len = state->lens[state->have - 1]; copy = 3 + BITS(2); DROPBITS(2); } else if (this.val == 17) { NEEDBITS(this.bits + 3); DROPBITS(this.bits); len = 0; copy = 3 + BITS(3); DROPBITS(3); } else { NEEDBITS(this.bits + 7); DROPBITS(this.bits); len = 0; copy = 11 + BITS(7); DROPBITS(7); } if (state->have + copy > state->nlen + state->ndist) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } while (copy--) state->lens[state->have++] = (unsigned short)len; } } /* handle error breaks in while */ if (state->mode == BAD) break; /* build code tables */ state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 9; ret = zlib_inflate_table(LENS, state->lens, state->nlen, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid literal/lengths set"; state->mode = BAD; break; } state->distcode = (code const *)(state->next); state->distbits = 6; ret = zlib_inflate_table(DISTS, state->lens + state->nlen, state->ndist, &(state->next), &(state->distbits), state->work); if (ret) { strm->msg = (char *)"invalid distances set"; state->mode = BAD; break; } state->mode = LEN; fallthrough; case LEN: if (have >= 6 && left >= 258) { RESTORE(); inflate_fast(strm, out); LOAD(); break; } for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.op && (this.op & 0xf0) == 0) { last = this; for (;;) { this = state->lencode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); state->length = (unsigned)this.val; if ((int)(this.op) == 0) { state->mode = LIT; break; } if (this.op & 32) { state->mode = TYPE; break; } if (this.op & 64) { strm->msg = (char *)"invalid literal/length code"; state->mode = BAD; break; } state->extra = (unsigned)(this.op) & 15; state->mode = LENEXT; fallthrough; case LENEXT: if (state->extra) { NEEDBITS(state->extra); state->length += BITS(state->extra); DROPBITS(state->extra); } state->mode = DIST; fallthrough; case DIST: for (;;) { this = state->distcode[BITS(state->distbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if ((this.op & 0xf0) == 0) { last = this; for (;;) { this = state->distcode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); if (this.op & 64) { strm->msg = (char *)"invalid distance code"; state->mode = BAD; break; } state->offset = (unsigned)this.val; state->extra = (unsigned)(this.op) & 15; state->mode = DISTEXT; fallthrough; case DISTEXT: if (state->extra) { NEEDBITS(state->extra); state->offset += BITS(state->extra); DROPBITS(state->extra); } #ifdef INFLATE_STRICT if (state->offset > state->dmax) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } #endif if (state->offset > state->whave + out - left) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } state->mode = MATCH; fallthrough; case MATCH: if (left == 0) goto inf_leave; copy = out - left; if (state->offset > copy) { /* copy from window */ copy = state->offset - copy; if (copy > state->write) { copy -= state->write; from = state->window + (state->wsize - copy); } else from = state->window + (state->write - copy); if (copy > state->length) copy = state->length; } else { /* copy from output */ from = put - state->offset; copy = state->length; } if (copy > left) copy = left; left -= copy; state->length -= copy; do { *put++ = *from++; } while (--copy); if (state->length == 0) state->mode = LEN; break; case LIT: if (left == 0) goto inf_leave; *put++ = (unsigned char)(state->length); left--; state->mode = LEN; break; case CHECK: if (state->wrap) { NEEDBITS(32); out -= left; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && out) strm->adler = state->check = UPDATE(state->check, put - out, out); out = left; if (( REVERSE(hold)) != state->check) { strm->msg = (char *)"incorrect data check"; state->mode = BAD; break; } INITBITS(); } state->mode = DONE; fallthrough; case DONE: ret = Z_STREAM_END; goto inf_leave; case BAD: ret = Z_DATA_ERROR; goto inf_leave; case MEM: return Z_MEM_ERROR; case SYNC: default: return Z_STREAM_ERROR; } /* Return from inflate(), updating the total counts and the check value. If there was no progress during the inflate() call, return a buffer error. Call zlib_updatewindow() to create and/or update the window state. */ inf_leave: RESTORE(); if (INFLATE_NEED_UPDATEWINDOW(strm) && (state->wsize || (state->mode < CHECK && out != strm->avail_out))) zlib_updatewindow(strm, out); in -= strm->avail_in; out -= strm->avail_out; strm->total_in += in; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && state->wrap && out) strm->adler = state->check = UPDATE(state->check, strm->next_out - out, out); strm->data_type = state->bits + (state->last ? 64 : 0) + (state->mode == TYPE ? 128 : 0); if (flush == Z_PACKET_FLUSH && ret == Z_OK && strm->avail_out != 0 && strm->avail_in == 0) return zlib_inflateSyncPacket(strm); if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK) ret = Z_BUF_ERROR; return ret; } int zlib_inflateEnd(z_streamp strm) { if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; return Z_OK; } /* * This subroutine adds the data at next_in/avail_in to the output history * without performing any output. The output buffer must be "caught up"; * i.e. no pending output but this should always be the case. The state must * be waiting on the start of a block (i.e. mode == TYPE or HEAD). On exit, * the output will also be caught up, and the checksum will have been updated * if need be. */ int zlib_inflateIncomp(z_stream *z) { struct inflate_state *state = (struct inflate_state *)z->state; Byte *saved_no = z->next_out; uInt saved_ao = z->avail_out; if (state->mode != TYPE && state->mode != HEAD) return Z_DATA_ERROR; /* Setup some variables to allow misuse of updateWindow */ z->avail_out = 0; z->next_out = (unsigned char*)z->next_in + z->avail_in; zlib_updatewindow(z, z->avail_in); /* Restore saved variables */ z->avail_out = saved_ao; z->next_out = saved_no; z->adler = state->check = UPDATE(state->check, z->next_in, z->avail_in); z->total_out += z->avail_in; z->total_in += z->avail_in; z->next_in += z->avail_in; state->total += z->avail_in; z->avail_in = 0; return Z_OK; } |
| 23 368 491 391 393 392 370 430 18 431 430 431 429 106 392 393 4 1 2 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 | // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "super-io.h" #include "sb-counters.h" /* BCH_SB_FIELD_counters */ static const u8 counters_to_stable_map[] = { #define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n, BCH_PERSISTENT_COUNTERS() #undef x }; const char * const bch2_counter_names[] = { #define x(t, n, ...) (#t), BCH_PERSISTENT_COUNTERS() #undef x NULL }; static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) { if (!ctrs) return 0; return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; } static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f, enum bch_validate_flags flags, struct printbuf *err) { return 0; } static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_counters *ctrs = field_to_type(f, counters); unsigned int nr = bch2_sb_counter_nr_entries(ctrs); for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { unsigned stable = counters_to_stable_map[i]; if (stable < nr) prt_printf(out, "%s \t%llu\n", bch2_counter_names[i], le64_to_cpu(ctrs->d[stable])); } } int bch2_sb_counters_to_cpu(struct bch_fs *c) { struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); unsigned int nr = bch2_sb_counter_nr_entries(ctrs); for (unsigned i = 0; i < BCH_COUNTER_NR; i++) c->counters_on_mount[i] = 0; for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { unsigned stable = counters_to_stable_map[i]; if (stable < nr) { u64 v = le64_to_cpu(ctrs->d[stable]); percpu_u64_set(&c->counters[i], v); c->counters_on_mount[i] = v; } } return 0; } int bch2_sb_counters_from_cpu(struct bch_fs *c) { struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); struct bch_sb_field_counters *ret; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); if (nr < BCH_COUNTER_NR) { ret = bch2_sb_field_resize(&c->disk_sb, counters, sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); if (ret) { ctrs = ret; nr = bch2_sb_counter_nr_entries(ctrs); } } for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { unsigned stable = counters_to_stable_map[i]; if (stable < nr) ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i])); } return 0; } void bch2_fs_counters_exit(struct bch_fs *c) { free_percpu(c->counters); } int bch2_fs_counters_init(struct bch_fs *c) { c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64)); if (!c->counters) return -BCH_ERR_ENOMEM_fs_counters_init; return bch2_sb_counters_to_cpu(c); } const struct bch_sb_field_ops bch_sb_field_ops_counters = { .validate = bch2_sb_counters_validate, .to_text = bch2_sb_counters_to_text, }; #ifndef NO_BCACHEFS_CHARDEV long bch2_ioctl_query_counters(struct bch_fs *c, struct bch_ioctl_query_counters __user *user_arg) { struct bch_ioctl_query_counters arg; int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)); if (ret) return ret; if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) || arg.pad) return -EINVAL; arg.nr = min(arg.nr, BCH_COUNTER_NR); ret = put_user(arg.nr, &user_arg->nr); if (ret) return ret; for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { unsigned stable = counters_to_stable_map[i]; if (stable < arg.nr) { u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT) ? percpu_u64_get(&c->counters[i]) : c->counters_on_mount[i]; ret = put_user(v, &user_arg->d[stable]); if (ret) return ret; } } return 0; } #endif |
| 13598 18 8946 9444 8932 132 14 3851 15 158 38 17 3242 19 27 10 1 5 338 36 4 13 24 292 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* audit.h -- Auditing support * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * * Written by Rickard E. (Rik) Faith <faith@redhat.com> */ #ifndef _LINUX_AUDIT_H_ #define _LINUX_AUDIT_H_ #include <linux/sched.h> #include <linux/ptrace.h> #include <linux/audit_arch.h> #include <uapi/linux/audit.h> #include <uapi/linux/netfilter/nf_tables.h> #include <uapi/linux/fanotify.h> #define AUDIT_INO_UNSET ((unsigned long)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) struct audit_sig_info { uid_t uid; pid_t pid; char ctx[]; }; struct audit_buffer; struct audit_context; struct inode; struct netlink_skb_parms; struct path; struct linux_binprm; struct mq_attr; struct mqstat; struct audit_watch; struct audit_tree; struct sk_buff; struct kern_ipc_perm; struct audit_krule { u32 pflags; u32 flags; u32 listnr; u32 action; u32 mask[AUDIT_BITMASK_SIZE]; u32 buflen; /* for data alloc on list rules */ u32 field_count; char *filterkey; /* ties events to rules */ struct audit_field *fields; struct audit_field *arch_f; /* quick access to arch field */ struct audit_field *inode_f; /* quick access to an inode field */ struct audit_watch *watch; /* associated watch */ struct audit_tree *tree; /* associated watched tree */ struct audit_fsnotify_mark *exe; struct list_head rlist; /* entry in audit_{watch,tree}.rules list */ struct list_head list; /* for AUDIT_LIST* purposes only */ u64 prio; }; /* Flag to indicate legacy AUDIT_LOGINUID unset usage */ #define AUDIT_LOGINUID_LEGACY 0x1 struct audit_field { u32 type; union { u32 val; kuid_t uid; kgid_t gid; struct { char *lsm_str; void *lsm_rule; }; }; u32 op; }; enum audit_ntp_type { AUDIT_NTP_OFFSET, AUDIT_NTP_FREQ, AUDIT_NTP_STATUS, AUDIT_NTP_TAI, AUDIT_NTP_TICK, AUDIT_NTP_ADJUST, AUDIT_NTP_NVALS /* count */ }; #ifdef CONFIG_AUDITSYSCALL struct audit_ntp_val { long long oldval, newval; }; struct audit_ntp_data { struct audit_ntp_val vals[AUDIT_NTP_NVALS]; }; #else struct audit_ntp_data {}; #endif enum audit_nfcfgop { AUDIT_XT_OP_REGISTER, AUDIT_XT_OP_REPLACE, AUDIT_XT_OP_UNREGISTER, AUDIT_NFT_OP_TABLE_REGISTER, AUDIT_NFT_OP_TABLE_UNREGISTER, AUDIT_NFT_OP_CHAIN_REGISTER, AUDIT_NFT_OP_CHAIN_UNREGISTER, AUDIT_NFT_OP_RULE_REGISTER, AUDIT_NFT_OP_RULE_UNREGISTER, AUDIT_NFT_OP_SET_REGISTER, AUDIT_NFT_OP_SET_UNREGISTER, AUDIT_NFT_OP_SETELEM_REGISTER, AUDIT_NFT_OP_SETELEM_UNREGISTER, AUDIT_NFT_OP_GEN_REGISTER, AUDIT_NFT_OP_OBJ_REGISTER, AUDIT_NFT_OP_OBJ_UNREGISTER, AUDIT_NFT_OP_OBJ_RESET, AUDIT_NFT_OP_FLOWTABLE_REGISTER, AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, AUDIT_NFT_OP_SETELEM_RESET, AUDIT_NFT_OP_RULE_RESET, AUDIT_NFT_OP_INVALID, }; extern int __init audit_register_class(int class, unsigned *list); extern int audit_classify_syscall(int abi, unsigned syscall); extern int audit_classify_arch(int arch); /* only for compat system calls */ extern unsigned compat_write_class[]; extern unsigned compat_read_class[]; extern unsigned compat_dir_class[]; extern unsigned compat_chattr_class[]; extern unsigned compat_signal_class[]; /* audit_names->type values */ #define AUDIT_TYPE_UNKNOWN 0 /* we don't know yet */ #define AUDIT_TYPE_NORMAL 1 /* a "normal" audit record */ #define AUDIT_TYPE_PARENT 2 /* a parent audit record */ #define AUDIT_TYPE_CHILD_DELETE 3 /* a child being deleted */ #define AUDIT_TYPE_CHILD_CREATE 4 /* a child being created */ /* maximized args number that audit_socketcall can process */ #define AUDITSC_ARGS 6 /* bit values for ->signal->audit_tty */ #define AUDIT_TTY_ENABLE BIT(0) #define AUDIT_TTY_LOG_PASSWD BIT(1) struct filename; #define AUDIT_OFF 0 #define AUDIT_ON 1 #define AUDIT_LOCKED 2 #ifdef CONFIG_AUDIT /* These are defined in audit.c */ /* Public API */ extern __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...); extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type); extern __printf(2, 3) void audit_log_format(struct audit_buffer *ab, const char *fmt, ...); extern void audit_log_end(struct audit_buffer *ab); extern bool audit_string_contains_control(const char *string, size_t len); extern void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len); extern void audit_log_n_string(struct audit_buffer *ab, const char *buf, size_t n); extern void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t n); extern void audit_log_untrustedstring(struct audit_buffer *ab, const char *string); extern void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path); extern void audit_log_key(struct audit_buffer *ab, char *key); extern void audit_log_path_denied(int type, const char *operation); extern void audit_log_lost(const char *message); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); extern int audit_update_lsm_rules(void); /* Private API (for audit.c only) */ extern int audit_rule_change(int type, int seq, void *data, size_t datasz); extern int audit_list_rules_send(struct sk_buff *request_skb, int seq); extern int audit_set_loginuid(kuid_t loginuid); static inline kuid_t audit_get_loginuid(struct task_struct *tsk) { return tsk->loginuid; } static inline unsigned int audit_get_sessionid(struct task_struct *tsk) { return tsk->sessionid; } extern u32 audit_enabled; extern int audit_signal_info(int sig, struct task_struct *t); #else /* CONFIG_AUDIT */ static inline __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { } static inline struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { return NULL; } static inline __printf(2, 3) void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) { } static inline void audit_log_end(struct audit_buffer *ab) { } static inline void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) { } static inline void audit_log_n_string(struct audit_buffer *ab, const char *buf, size_t n) { } static inline void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t n) { } static inline void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { } static inline void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path) { } static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline void audit_log_path_denied(int type, const char *operation) { } static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; } static inline void audit_log_task_info(struct audit_buffer *ab) { } static inline kuid_t audit_get_loginuid(struct task_struct *tsk) { return INVALID_UID; } static inline unsigned int audit_get_sessionid(struct task_struct *tsk) { return AUDIT_SID_UNSET; } #define audit_enabled AUDIT_OFF static inline int audit_signal_info(int sig, struct task_struct *t) { return 0; } #endif /* CONFIG_AUDIT */ #ifdef CONFIG_AUDIT_COMPAT_GENERIC #define audit_is_compat(arch) (!((arch) & __AUDIT_ARCH_64BIT)) #else #define audit_is_compat(arch) false #endif #define AUDIT_INODE_PARENT 1 /* dentry represents the parent */ #define AUDIT_INODE_HIDDEN 2 /* audit record should be hidden */ #define AUDIT_INODE_NOEVAL 4 /* audit record incomplete */ #ifdef CONFIG_AUDITSYSCALL #include <asm/syscall.h> /* for syscall_get_arch() */ /* These are defined in auditsc.c */ /* Public API */ extern int audit_alloc(struct task_struct *task); extern void __audit_free(struct task_struct *task); extern void __audit_uring_entry(u8 op); extern void __audit_uring_exit(int success, long code); extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3); extern void __audit_syscall_exit(int ret_success, long ret_value); extern struct filename *__audit_reusename(const __user char *uptr); extern void __audit_getname(struct filename *name); extern void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags); extern void __audit_file(const struct file *); extern void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type); extern void audit_seccomp(unsigned long syscall, long signr, int code); extern void audit_seccomp_actions_logged(const char *names, const char *old_names, int res); extern void __audit_ptrace(struct task_struct *t); static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx) { task->audit_context = ctx; } static inline struct audit_context *audit_context(void) { return current->audit_context; } static inline bool audit_dummy_context(void) { void *p = audit_context(); return !p || *(int *)p; } static inline void audit_free(struct task_struct *task) { if (unlikely(task->audit_context)) __audit_free(task); } static inline void audit_uring_entry(u8 op) { /* * We intentionally check audit_context() before audit_enabled as most * Linux systems (as of ~2021) rely on systemd which forces audit to * be enabled regardless of the user's audit configuration. */ if (unlikely(audit_context() && audit_enabled)) __audit_uring_entry(op); } static inline void audit_uring_exit(int success, long code) { if (unlikely(audit_context())) __audit_uring_exit(success, code); } static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) { if (unlikely(audit_context())) __audit_syscall_entry(major, a0, a1, a2, a3); } static inline void audit_syscall_exit(void *pt_regs) { if (unlikely(audit_context())) { int success = is_syscall_success(pt_regs); long return_code = regs_return_value(pt_regs); __audit_syscall_exit(success, return_code); } } static inline struct filename *audit_reusename(const __user char *name) { if (unlikely(!audit_dummy_context())) return __audit_reusename(name); return NULL; } static inline void audit_getname(struct filename *name) { if (unlikely(!audit_dummy_context())) __audit_getname(name); } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { if (unlikely(!audit_dummy_context())) __audit_inode(name, dentry, aflags); } static inline void audit_file(struct file *file) { if (unlikely(!audit_dummy_context())) __audit_file(file); } static inline void audit_inode_parent_hidden(struct filename *name, const struct dentry *dentry) { if (unlikely(!audit_dummy_context())) __audit_inode(name, dentry, AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN); } static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { if (unlikely(!audit_dummy_context())) __audit_inode_child(parent, dentry, type); } void audit_core_dumps(long signr); static inline void audit_ptrace(struct task_struct *t) { if (unlikely(!audit_dummy_context())) __audit_ptrace(t); } /* Private API (for audit.c only) */ extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); extern void __audit_bprm(struct linux_binprm *bprm); extern int __audit_socketcall(int nargs, unsigned long *args); extern int __audit_sockaddr(int len, void *addr); extern void __audit_fd_pair(int fd1, int fd2); extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr); extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout); extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification); extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old); extern void __audit_log_capset(const struct cred *new, const struct cred *old); extern void __audit_mmap_fd(int fd, int flags); extern void __audit_openat2_how(struct open_how *how); extern void __audit_log_kern_module(char *name); extern void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar); extern void __audit_tk_injoffset(struct timespec64 offset); extern void __audit_ntp_log(const struct audit_ntp_data *ad); extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp); static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { if (unlikely(!audit_dummy_context())) __audit_ipc_obj(ipcp); } static inline void audit_fd_pair(int fd1, int fd2) { if (unlikely(!audit_dummy_context())) __audit_fd_pair(fd1, fd2); } static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { if (unlikely(!audit_dummy_context())) __audit_ipc_set_perm(qbytes, uid, gid, mode); } static inline void audit_bprm(struct linux_binprm *bprm) { if (unlikely(!audit_dummy_context())) __audit_bprm(bprm); } static inline int audit_socketcall(int nargs, unsigned long *args) { if (unlikely(!audit_dummy_context())) return __audit_socketcall(nargs, args); return 0; } static inline int audit_socketcall_compat(int nargs, u32 *args) { unsigned long a[AUDITSC_ARGS]; int i; if (audit_dummy_context()) return 0; for (i = 0; i < nargs; i++) a[i] = (unsigned long)args[i]; return __audit_socketcall(nargs, a); } static inline int audit_sockaddr(int len, void *addr) { if (unlikely(!audit_dummy_context())) return __audit_sockaddr(len, addr); return 0; } static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { if (unlikely(!audit_dummy_context())) __audit_mq_open(oflag, mode, attr); } static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { if (unlikely(!audit_dummy_context())) __audit_mq_sendrecv(mqdes, msg_len, msg_prio, abs_timeout); } static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { if (unlikely(!audit_dummy_context())) __audit_mq_notify(mqdes, notification); } static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { if (unlikely(!audit_dummy_context())) __audit_mq_getsetattr(mqdes, mqstat); } static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { if (unlikely(!audit_dummy_context())) return __audit_log_bprm_fcaps(bprm, new, old); return 0; } static inline void audit_log_capset(const struct cred *new, const struct cred *old) { if (unlikely(!audit_dummy_context())) __audit_log_capset(new, old); } static inline void audit_mmap_fd(int fd, int flags) { if (unlikely(!audit_dummy_context())) __audit_mmap_fd(fd, flags); } static inline void audit_openat2_how(struct open_how *how) { if (unlikely(!audit_dummy_context())) __audit_openat2_how(how); } static inline void audit_log_kern_module(char *name) { if (!audit_dummy_context()) __audit_log_kern_module(name); } static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { if (!audit_dummy_context()) __audit_fanotify(response, friar); } static inline void audit_tk_injoffset(struct timespec64 offset) { /* ignore no-op events */ if (offset.tv_sec == 0 && offset.tv_nsec == 0) return; if (!audit_dummy_context()) __audit_tk_injoffset(offset); } static inline void audit_ntp_init(struct audit_ntp_data *ad) { memset(ad, 0, sizeof(*ad)); } static inline void audit_ntp_set_old(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { ad->vals[type].oldval = val; } static inline void audit_ntp_set_new(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { ad->vals[type].newval = val; } static inline void audit_ntp_log(const struct audit_ntp_data *ad) { if (!audit_dummy_context()) __audit_ntp_log(ad); } static inline void audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp) { if (audit_enabled) __audit_log_nfcfg(name, af, nentries, op, gfp); } extern int audit_n_rules; extern int audit_signals; #else /* CONFIG_AUDITSYSCALL */ static inline int audit_alloc(struct task_struct *task) { return 0; } static inline void audit_free(struct task_struct *task) { } static inline void audit_uring_entry(u8 op) { } static inline void audit_uring_exit(int success, long code) { } static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) { } static inline void audit_syscall_exit(void *pt_regs) { } static inline bool audit_dummy_context(void) { return true; } static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx) { } static inline struct audit_context *audit_context(void) { return NULL; } static inline struct filename *audit_reusename(const __user char *name) { return NULL; } static inline void audit_getname(struct filename *name) { } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { } static inline void audit_file(struct file *file) { } static inline void audit_inode_parent_hidden(struct filename *name, const struct dentry *dentry) { } static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { } static inline void audit_core_dumps(long signr) { } static inline void audit_seccomp(unsigned long syscall, long signr, int code) { } static inline void audit_seccomp_actions_logged(const char *names, const char *old_names, int res) { } static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { } static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { } static inline void audit_bprm(struct linux_binprm *bprm) { } static inline int audit_socketcall(int nargs, unsigned long *args) { return 0; } static inline int audit_socketcall_compat(int nargs, u32 *args) { return 0; } static inline void audit_fd_pair(int fd1, int fd2) { } static inline int audit_sockaddr(int len, void *addr) { return 0; } static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { } static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { } static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { } static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { } static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { return 0; } static inline void audit_log_capset(const struct cred *new, const struct cred *old) { } static inline void audit_mmap_fd(int fd, int flags) { } static inline void audit_openat2_how(struct open_how *how) { } static inline void audit_log_kern_module(char *name) { } static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { } static inline void audit_tk_injoffset(struct timespec64 offset) { } static inline void audit_ntp_init(struct audit_ntp_data *ad) { } static inline void audit_ntp_set_old(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { } static inline void audit_ntp_set_new(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { } static inline void audit_ntp_log(const struct audit_ntp_data *ad) { } static inline void audit_ptrace(struct task_struct *t) { } static inline void audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp) { } #define audit_n_rules 0 #define audit_signals 0 #endif /* CONFIG_AUDITSYSCALL */ static inline bool audit_loginuid_set(struct task_struct *tsk) { return uid_valid(audit_get_loginuid(tsk)); } #endif |
| 23 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 | // SPDX-License-Identifier: GPL-2.0 /* * RTC related functions */ #include <linux/platform_device.h> #include <linux/mc146818rtc.h> #include <linux/export.h> #include <linux/pnp.h> #include <asm/vsyscall.h> #include <asm/x86_init.h> #include <asm/time.h> #include <asm/setup.h> #ifdef CONFIG_X86_32 /* * This is a special lock that is owned by the CPU and holds the index * register we are working with. It is required for NMI access to the * CMOS/RTC registers. See arch/x86/include/asm/mc146818rtc.h for details. */ volatile unsigned long cmos_lock; EXPORT_SYMBOL(cmos_lock); #endif /* CONFIG_X86_32 */ DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); /* * In order to set the CMOS clock precisely, mach_set_cmos_time has to be * called 500 ms after the second nowtime has started, because when * nowtime is written into the registers of the CMOS clock, it will * jump to the next second precisely 500 ms later. Check the Motorola * MC146818A or Dallas DS12887 data sheet for details. */ int mach_set_cmos_time(const struct timespec64 *now) { unsigned long long nowtime = now->tv_sec; struct rtc_time tm; int retval = 0; rtc_time64_to_tm(nowtime, &tm); if (!rtc_valid_tm(&tm)) { retval = mc146818_set_time(&tm); if (retval) printk(KERN_ERR "%s: RTC write failed with error %d\n", __func__, retval); } else { printk(KERN_ERR "%s: Invalid RTC value: write of %llx to RTC failed\n", __func__, nowtime); retval = -EINVAL; } return retval; } void mach_get_cmos_time(struct timespec64 *now) { struct rtc_time tm; /* * If pm_trace abused the RTC as storage, set the timespec to 0, * which tells the caller that this RTC value is unusable. */ if (!pm_trace_rtc_valid()) { now->tv_sec = now->tv_nsec = 0; return; } if (mc146818_get_time(&tm, 1000)) { pr_err("Unable to read current time from RTC\n"); now->tv_sec = now->tv_nsec = 0; return; } now->tv_sec = rtc_tm_to_time64(&tm); now->tv_nsec = 0; } /* Routines for accessing the CMOS RAM/RTC. */ unsigned char rtc_cmos_read(unsigned char addr) { unsigned char val; lock_cmos_prefix(addr); outb(addr, RTC_PORT(0)); val = inb(RTC_PORT(1)); lock_cmos_suffix(addr); return val; } EXPORT_SYMBOL(rtc_cmos_read); void rtc_cmos_write(unsigned char val, unsigned char addr) { lock_cmos_prefix(addr); outb(addr, RTC_PORT(0)); outb(val, RTC_PORT(1)); lock_cmos_suffix(addr); } EXPORT_SYMBOL(rtc_cmos_write); int update_persistent_clock64(struct timespec64 now) { return x86_platform.set_wallclock(&now); } /* not static: needed by APM */ void read_persistent_clock64(struct timespec64 *ts) { x86_platform.get_wallclock(ts); } static struct resource rtc_resources[] = { [0] = { .start = RTC_PORT(0), .end = RTC_PORT(1), .flags = IORESOURCE_IO, }, [1] = { .start = RTC_IRQ, .end = RTC_IRQ, .flags = IORESOURCE_IRQ, } }; static struct platform_device rtc_device = { .name = "rtc_cmos", .id = -1, .resource = rtc_resources, .num_resources = ARRAY_SIZE(rtc_resources), }; static __init int add_rtc_cmos(void) { #ifdef CONFIG_PNP static const char * const ids[] __initconst = { "PNP0b00", "PNP0b01", "PNP0b02", }; struct pnp_dev *dev; int i; pnp_for_each_dev(dev) { for (i = 0; i < ARRAY_SIZE(ids); i++) { if (compare_pnp_id(dev->id, ids[i]) != 0) return 0; } } #endif if (!x86_platform.legacy.rtc) return -ENODEV; platform_device_register(&rtc_device); dev_info(&rtc_device.dev, "registered platform RTC device (no PNP device found)\n"); return 0; } device_initcall(add_rtc_cmos); |
| 1 47 47 66 66 66 66 64 33 28 56 27 32 22 7 2 6 33 118 1 117 117 6 111 111 53 64 64 64 110 13 8 8 7 7 117 66 66 66 66 66 66 66 34 60 60 60 13 47 12 12 4 51 43 11 11 11 9 44 44 51 51 51 1 50 157 156 159 159 1 71 159 28 159 159 159 159 159 159 143 157 157 4 44 157 155 157 2 158 3 156 158 7 6 7 7 7 7 1 1 6 1 1 1 6 6 6 6 6 6 6 6 19 19 15 124 122 10 124 122 39 8 3 116 120 34 15 3 1 115 117 117 111 31 116 81 46 117 117 100 31 117 125 125 107 24 124 109 16 125 125 39 117 123 176 56 1 125 10 114 114 72 92 391 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 | // SPDX-License-Identifier: GPL-2.0 #ifndef NO_BCACHEFS_FS #include "bcachefs.h" #include "alloc_foreground.h" #include "bkey_buf.h" #include "fs-io.h" #include "fs-io-buffered.h" #include "fs-io-direct.h" #include "fs-io-pagecache.h" #include "io_read.h" #include "io_write.h" #include <linux/backing-dev.h> #include <linux/pagemap.h> #include <linux/writeback.h> static inline bool bio_full(struct bio *bio, unsigned len) { if (bio->bi_vcnt >= bio->bi_max_vecs) return true; if (bio->bi_iter.bi_size > UINT_MAX - len) return true; return false; } /* readpage(s): */ static void bch2_readpages_end_io(struct bio *bio) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK); bio_put(bio); } struct readpages_iter { struct address_space *mapping; unsigned idx; folios folios; }; static int readpages_iter_init(struct readpages_iter *iter, struct readahead_control *ractl) { struct folio *folio; *iter = (struct readpages_iter) { ractl->mapping }; while ((folio = __readahead_folio(ractl))) { if (!bch2_folio_create(folio, GFP_KERNEL) || darray_push(&iter->folios, folio)) { bch2_folio_release(folio); ractl->_nr_pages += folio_nr_pages(folio); ractl->_index -= folio_nr_pages(folio); return iter->folios.nr ? 0 : -ENOMEM; } folio_put(folio); } return 0; } static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) { if (iter->idx >= iter->folios.nr) return NULL; return iter->folios.data[iter->idx]; } static inline void readpage_iter_advance(struct readpages_iter *iter) { iter->idx++; } static bool extent_partial_reads_expensive(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_extent_crc_unpacked crc; const union bch_extent_entry *i; bkey_for_each_crc(k.k, ptrs, crc, i) if (crc.csum_type || crc.compression_type) return true; return false; } static int readpage_bio_extend(struct btree_trans *trans, struct readpages_iter *iter, struct bio *bio, unsigned sectors_this_extent, bool get_more) { /* Don't hold btree locks while allocating memory: */ bch2_trans_unlock(trans); while (bio_sectors(bio) < sectors_this_extent && bio->bi_vcnt < bio->bi_max_vecs) { struct folio *folio = readpage_iter_peek(iter); int ret; if (folio) { readpage_iter_advance(iter); } else { pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; if (!get_more) break; unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio); if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping)) break; unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS); /* ensure proper alignment */ order = min(order, __ffs(folio_offset|BIT(31))); folio = xa_load(&iter->mapping->i_pages, folio_offset); if (folio && !xa_is_value(folio)) break; folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order); if (!folio) break; if (!__bch2_folio_create(folio, GFP_KERNEL)) { folio_put(folio); break; } ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL); if (ret) { __bch2_folio_release(folio); folio_put(folio); break; } folio_put(folio); } BUG_ON(folio_sector(folio) != bio_end_sector(bio)); BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); } return bch2_trans_relock(trans); } static void bchfs_read(struct btree_trans *trans, struct bch_read_bio *rbio, subvol_inum inum, struct readpages_iter *readpages_iter) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_buf sk; int flags = BCH_READ_retry_if_stale| BCH_READ_may_promote; int ret = 0; rbio->subvol = inum.subvol; bch2_bkey_buf_init(&sk); bch2_trans_begin(trans); bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inum.inum, rbio->bio.bi_iter.bi_sector), BTREE_ITER_slots); while (1) { struct bkey_s_c k; unsigned bytes, sectors; s64 offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; bch2_trans_begin(trans); u32 snapshot; ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) goto err; bch2_btree_iter_set_snapshot(trans, &iter, snapshot); bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, rbio->bio.bi_iter.bi_sector)); k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; bch2_bkey_buf_reassemble(&sk, c, k); ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &sk); if (ret) goto err; k = bkey_i_to_s_c(sk.k); sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); if (readpages_iter) { ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, extent_partial_reads_expensive(k)); if (ret) goto err; } bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; swap(rbio->bio.bi_iter.bi_size, bytes); if (rbio->bio.bi_iter.bi_size == bytes) flags |= BCH_READ_last_fragment; bch2_bio_page_state_set(&rbio->bio, k); bch2_read_extent(trans, rbio, iter.pos, data_btree, k, offset_into_extent, flags); swap(rbio->bio.bi_iter.bi_size, bytes); if (flags & BCH_READ_last_fragment) break; bio_advance(&rbio->bio, bytes); err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; } bch2_trans_iter_exit(trans, &iter); if (ret) { struct printbuf buf = PRINTBUF; lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9)); prt_printf(&buf, "read error %i from btree lookup", ret); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); rbio->bio.bi_status = BLK_STS_IOERR; bio_endio(&rbio->bio); } bch2_bkey_buf_exit(&sk, c); } void bch2_readahead(struct readahead_control *ractl) { struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts; struct folio *folio; struct readpages_iter readpages_iter; struct blk_plug plug; bch2_inode_opts_get(&opts, c, &inode->ei_inode); int ret = readpages_iter_init(&readpages_iter, ractl); if (ret) return; /* * Besides being a general performance optimization, plugging helps with * avoiding btree transaction srcu warnings - submitting a bio can * block, and we don't want todo that with the transaction locked. * * However, plugged bios are submitted when we schedule; we ideally * would have our own scheduler hook to call unlock_long() before * scheduling. */ blk_start_plug(&plug); bch2_pagecache_add_get(inode); struct btree_trans *trans = bch2_trans_get(c); while ((folio = readpage_iter_peek(&readpages_iter))) { unsigned n = min_t(unsigned, readpages_iter.folios.nr - readpages_iter.idx, BIO_MAX_VECS); struct bch_read_bio *rbio = rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, GFP_KERNEL, &c->bio_read), c, opts, bch2_readpages_end_io); readpage_iter_advance(&readpages_iter); rbio->bio.bi_iter.bi_sector = folio_sector(folio); BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); bchfs_read(trans, rbio, inode_inum(inode), &readpages_iter); bch2_trans_unlock(trans); } bch2_trans_put(trans); bch2_pagecache_add_put(inode); blk_finish_plug(&plug); darray_exit(&readpages_iter.folios); } static void bch2_read_single_folio_end_io(struct bio *bio) { complete(bio->bi_private); } int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_read_bio *rbio; struct bch_io_opts opts; struct blk_plug plug; int ret; DECLARE_COMPLETION_ONSTACK(done); BUG_ON(folio_test_uptodate(folio)); BUG_ON(folio_test_dirty(folio)); if (!bch2_folio_create(folio, GFP_KERNEL)) return -ENOMEM; bch2_inode_opts_get(&opts, c, &inode->ei_inode); rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), c, opts, bch2_read_single_folio_end_io); rbio->bio.bi_private = &done; rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; rbio->bio.bi_iter.bi_sector = folio_sector(folio); BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); blk_start_plug(&plug); bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0)); blk_finish_plug(&plug); wait_for_completion(&done); ret = blk_status_to_errno(rbio->bio.bi_status); bio_put(&rbio->bio); if (ret < 0) return ret; folio_mark_uptodate(folio); return 0; } int bch2_read_folio(struct file *file, struct folio *folio) { int ret; ret = bch2_read_single_folio(folio, folio->mapping); folio_unlock(folio); return bch2_err_class(ret); } /* writepages: */ struct bch_writepage_io { struct bch_inode_info *inode; /* must be last: */ struct bch_write_op op; }; struct bch_writepage_state { struct bch_writepage_io *io; struct bch_io_opts opts; struct bch_folio_sector *tmp; unsigned tmp_sectors; }; static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, struct bch_inode_info *inode) { struct bch_writepage_state ret = { 0 }; bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); return ret; } /* * Determine when a writepage io is full. We have to limit writepage bios to a * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to * what the bounce path in bch2_write_extent() can handle. In theory we could * loosen this restriction for non-bounce I/O, but we don't have that context * here. Ideally, we can up this limit and make it configurable in the future * when the bounce path can be enhanced to accommodate larger source bios. */ static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len) { struct bio *bio = &io->op.wbio.bio; return bio_full(bio, len) || (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE); } static void bch2_writepage_io_done(struct bch_write_op *op) { struct bch_writepage_io *io = container_of(op, struct bch_writepage_io, op); struct bch_fs *c = io->op.c; struct bio *bio = &io->op.wbio.bio; struct folio_iter fi; unsigned i; if (io->op.error) { set_bit(EI_INODE_ERROR, &io->inode->ei_flags); bio_for_each_folio_all(fi, bio) { struct bch_folio *s; mapping_set_error(fi.folio->mapping, -EIO); s = __bch2_folio(fi.folio); spin_lock(&s->lock); for (i = 0; i < folio_sectors(fi.folio); i++) s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } if (io->op.flags & BCH_WRITE_wrote_data_inline) { bio_for_each_folio_all(fi, bio) { struct bch_folio *s; s = __bch2_folio(fi.folio); spin_lock(&s->lock); for (i = 0; i < folio_sectors(fi.folio); i++) s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } /* * racing with fallocate can cause us to add fewer sectors than * expected - but we shouldn't add more sectors than expected: */ WARN_ON_ONCE(io->op.i_sectors_delta > 0); /* * (error (due to going RO) halfway through a page can screw that up * slightly) * XXX wtf? BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); */ /* * The writeback flag is effectively our ref on the inode - * fixup i_blocks before calling folio_end_writeback: */ bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); bio_for_each_folio_all(fi, bio) { struct bch_folio *s = __bch2_folio(fi.folio); if (atomic_dec_and_test(&s->write_count)) folio_end_writeback(fi.folio); } bio_put(&io->op.wbio.bio); } static void bch2_writepage_do_io(struct bch_writepage_state *w) { struct bch_writepage_io *io = w->io; w->io = NULL; closure_call(&io->op.cl, bch2_write, NULL, NULL); } /* * Get a bch_writepage_io and add @page to it - appending to an existing one if * possible, else allocating a new one: */ static void bch2_writepage_io_alloc(struct bch_fs *c, struct writeback_control *wbc, struct bch_writepage_state *w, struct bch_inode_info *inode, u64 sector, unsigned nr_replicas) { struct bch_write_op *op; w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, REQ_OP_WRITE, GFP_KERNEL, &c->writepage_bioset), struct bch_writepage_io, op.wbio.bio); w->io->inode = inode; op = &w->io->op; bch2_write_op_init(op, c, w->opts); op->target = w->opts.foreground_target; op->nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); op->subvol = inode->ei_inum.subvol; op->pos = POS(inode->v.i_ino, sector); op->end_io = bch2_writepage_io_done; op->devs_need_flush = &inode->ei_devs_need_flush; op->wbio.bio.bi_iter.bi_sector = sector; op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); } static int __bch2_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_writepage_state *w = data; struct bch_folio *s; unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; loff_t i_size = i_size_read(&inode->v); int ret; EBUG_ON(!folio_test_uptodate(folio)); /* Is the folio fully inside i_size? */ if (folio_end_pos(folio) <= i_size) goto do_io; /* Is the folio fully outside i_size? (truncate in progress) */ if (folio_pos(folio) >= i_size) { folio_unlock(folio); return 0; } /* * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the folio size. For a file that is not a multiple of * the folio size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ folio_zero_segment(folio, i_size - folio_pos(folio), folio_size(folio)); do_io: f_sectors = folio_sectors(folio); s = bch2_folio(folio); if (f_sectors > w->tmp_sectors) { kfree(w->tmp); w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL); w->tmp_sectors = f_sectors; } /* * Things get really hairy with errors during writeback: */ ret = bch2_get_folio_disk_reservation(c, inode, folio, false); BUG_ON(ret); /* Before unlocking the page, get copy of reservations: */ spin_lock(&s->lock); memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); for (i = 0; i < f_sectors; i++) { if (s->s[i].state < SECTOR_dirty) continue; nr_replicas_this_write = min_t(unsigned, nr_replicas_this_write, s->s[i].nr_replicas + s->s[i].replicas_reserved); } for (i = 0; i < f_sectors; i++) { if (s->s[i].state < SECTOR_dirty) continue; s->s[i].nr_replicas = w->opts.compression ? 0 : nr_replicas_this_write; s->s[i].replicas_reserved = 0; bch2_folio_sector_set(folio, s, i, SECTOR_allocated); } spin_unlock(&s->lock); BUG_ON(atomic_read(&s->write_count)); atomic_set(&s->write_count, 1); BUG_ON(folio_test_writeback(folio)); folio_start_writeback(folio); folio_unlock(folio); offset = 0; while (1) { unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; u64 sector; while (offset < f_sectors && w->tmp[offset].state < SECTOR_dirty) offset++; if (offset == f_sectors) break; while (offset + sectors < f_sectors && w->tmp[offset + sectors].state >= SECTOR_dirty) { reserved_sectors += w->tmp[offset + sectors].replicas_reserved; dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; sectors++; } BUG_ON(!sectors); sector = folio_sector(folio) + offset; if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || bch_io_full(w->io, sectors << 9) || bio_end_sector(&w->io->op.wbio.bio) != sector)) bch2_writepage_do_io(w); if (!w->io) bch2_writepage_io_alloc(c, wbc, w, inode, sector, nr_replicas_this_write); atomic_inc(&s->write_count); BUG_ON(inode != w->io->inode); BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, sectors << 9, offset << 9)); w->io->op.res.sectors += reserved_sectors; w->io->op.i_sectors_delta -= dirty_sectors; w->io->op.new_i_size = i_size; offset += sectors; } if (atomic_dec_and_test(&s->write_count)) folio_end_writeback(folio); return 0; } int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct bch_fs *c = mapping->host->i_sb->s_fs_info; struct bch_writepage_state w = bch_writepage_state_init(c, to_bch_ei(mapping->host)); struct blk_plug plug; int ret; blk_start_plug(&plug); ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); if (w.io) bch2_writepage_do_io(&w); blk_finish_plug(&plug); kfree(w.tmp); return bch2_err_class(ret); } /* buffered writes: */ int bch2_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation *res; struct folio *folio; unsigned offset; int ret = -ENOMEM; res = kmalloc(sizeof(*res), GFP_KERNEL); if (!res) return -ENOMEM; bch2_folio_reservation_init(c, inode, res); *fsdata = res; bch2_pagecache_add_get(inode); folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN | fgf_set_order(len), mapping_gfp_mask(mapping)); if (IS_ERR(folio)) goto err_unlock; offset = pos - folio_pos(folio); len = min_t(size_t, len, folio_end_pos(folio) - pos); if (folio_test_uptodate(folio)) goto out; /* If we're writing entire folio, don't need to read it in first: */ if (!offset && len == folio_size(folio)) goto out; if (!offset && pos + len >= inode->v.i_size) { folio_zero_segment(folio, len, folio_size(folio)); flush_dcache_folio(folio); goto out; } if (folio_pos(folio) >= inode->v.i_size) { folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); flush_dcache_folio(folio); goto out; } readpage: ret = bch2_read_single_folio(folio, mapping); if (ret) goto err; out: ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); if (ret) goto err; ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); if (ret) { if (!folio_test_uptodate(folio)) { /* * If the folio hasn't been read in, we won't know if we * actually need a reservation - we don't actually need * to read here, we just need to check if the folio is * fully backed by uncompressed data: */ goto readpage; } goto err; } *foliop = folio; return 0; err: folio_unlock(folio); folio_put(folio); err_unlock: bch2_pagecache_add_put(inode); kfree(res); *fsdata = NULL; return bch2_err_class(ret); } int bch2_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation *res = fsdata; unsigned offset = pos - folio_pos(folio); lockdep_assert_held(&inode->v.i_rwsem); BUG_ON(offset + copied > folio_size(folio)); if (unlikely(copied < len && !folio_test_uptodate(folio))) { /* * The folio needs to be read in, but that would destroy * our partial write - simplest thing is to just force * userspace to redo the write: */ folio_zero_range(folio, 0, folio_size(folio)); flush_dcache_folio(folio); copied = 0; } spin_lock(&inode->v.i_lock); if (pos + copied > inode->v.i_size) i_size_write(&inode->v, pos + copied); spin_unlock(&inode->v.i_lock); if (copied) { if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); bch2_set_folio_dirty(c, inode, folio, res, offset, copied); inode->ei_last_dirtied = (unsigned long) current; } folio_unlock(folio); folio_put(folio); bch2_pagecache_add_put(inode); bch2_folio_reservation_put(c, inode, res); kfree(res); return copied; } static noinline void folios_trunc(folios *fs, struct folio **fi) { while (fs->data + fs->nr > fi) { struct folio *f = darray_pop(fs); folio_unlock(f); folio_put(f); } } static int __bch2_buffered_write(struct bch_inode_info *inode, struct address_space *mapping, struct iov_iter *iter, loff_t pos, unsigned len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation res; folios fs; struct folio *f; unsigned copied = 0, f_offset, f_copied; u64 end = pos + len, f_pos, f_len; loff_t last_folio_pos = inode->v.i_size; int ret = 0; BUG_ON(!len); bch2_folio_reservation_init(c, inode, &res); darray_init(&fs); ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, FGP_WRITEBEGIN | fgf_set_order(len), mapping_gfp_mask(mapping), &fs); if (ret) goto out; BUG_ON(!fs.nr); f = darray_first(fs); if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ret = bch2_read_single_folio(f, mapping); if (ret) goto out; } f = darray_last(fs); end = min(end, folio_end_pos(f)); last_folio_pos = folio_pos(f); if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { if (end >= inode->v.i_size) { folio_zero_range(f, 0, folio_size(f)); } else { ret = bch2_read_single_folio(f, mapping); if (ret) goto out; } } ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr); if (ret) goto out; f_pos = pos; f_offset = pos - folio_pos(darray_first(fs)); darray_for_each(fs, fi) { ssize_t f_reserved; f = *fi; f_len = min(end, folio_end_pos(f)) - f_pos; f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len); if (unlikely(f_reserved != f_len)) { if (f_reserved < 0) { if (f == darray_first(fs)) { ret = f_reserved; goto out; } folios_trunc(&fs, fi); end = min(end, folio_end_pos(darray_last(fs))); } else { if (!folio_test_uptodate(f)) { ret = bch2_read_single_folio(f, mapping); if (ret) goto out; } folios_trunc(&fs, fi + 1); end = f_pos + f_reserved; } break; } f_pos = folio_end_pos(f); f_offset = 0; } if (mapping_writably_mapped(mapping)) darray_for_each(fs, fi) flush_dcache_folio(*fi); f_pos = pos; f_offset = pos - folio_pos(darray_first(fs)); darray_for_each(fs, fi) { f = *fi; f_len = min(end, folio_end_pos(f)) - f_pos; f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter); if (!f_copied) { folios_trunc(&fs, fi); break; } if (!folio_test_uptodate(f) && f_copied != folio_size(f) && pos + copied + f_copied < inode->v.i_size) { iov_iter_revert(iter, f_copied); folio_zero_range(f, 0, folio_size(f)); folios_trunc(&fs, fi); break; } flush_dcache_folio(f); copied += f_copied; if (f_copied != f_len) { folios_trunc(&fs, fi + 1); break; } f_pos = folio_end_pos(f); f_offset = 0; } if (!copied) goto out; end = pos + copied; spin_lock(&inode->v.i_lock); if (end > inode->v.i_size) i_size_write(&inode->v, end); spin_unlock(&inode->v.i_lock); f_pos = pos; f_offset = pos - folio_pos(darray_first(fs)); darray_for_each(fs, fi) { f = *fi; f_len = min(end, folio_end_pos(f)) - f_pos; if (!folio_test_uptodate(f)) folio_mark_uptodate(f); bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); f_pos = folio_end_pos(f); f_offset = 0; } inode->ei_last_dirtied = (unsigned long) current; out: darray_for_each(fs, fi) { folio_unlock(*fi); folio_put(*fi); } /* * If the last folio added to the mapping starts beyond current EOF, we * performed a short write but left around at least one post-EOF folio. * Clean up the mapping before we return. */ if (last_folio_pos >= inode->v.i_size) truncate_pagecache(&inode->v, inode->v.i_size); darray_exit(&fs); bch2_folio_reservation_put(c, inode, &res); return copied ?: ret; } static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); loff_t pos = iocb->ki_pos; ssize_t written = 0; int ret = 0; bch2_pagecache_add_get(inode); do { unsigned offset = pos & (PAGE_SIZE - 1); unsigned bytes = iov_iter_count(iter); again: /* * Bring in the user page that we will copy from _first_. * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. * * Not only is this an optimisation, but it is also required * to check that the address is actually valid, when atomic * usercopies are used, below. */ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { bytes = min_t(unsigned long, iov_iter_count(iter), PAGE_SIZE - offset); if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { ret = -EFAULT; break; } } if (unlikely(fatal_signal_pending(current))) { ret = -EINTR; break; } ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); if (unlikely(ret < 0)) break; cond_resched(); if (unlikely(ret == 0)) { /* * If we were unable to copy any data at all, we must * fall back to a single segment length write. * * If we didn't fallback here, we could livelock * because not all segments in the iov can be copied at * once without a pagefault. */ bytes = min_t(unsigned long, PAGE_SIZE - offset, iov_iter_single_seg_count(iter)); goto again; } pos += ret; written += ret; ret = 0; balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(iter)); bch2_pagecache_add_put(inode); return written ? written : ret; } ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct bch_inode_info *inode = file_bch_inode(file); ssize_t ret; if (iocb->ki_flags & IOCB_DIRECT) { ret = bch2_direct_write(iocb, from); goto out; } inode_lock(&inode->v); ret = generic_write_checks(iocb, from); if (ret <= 0) goto unlock; ret = file_remove_privs(file); if (ret) goto unlock; ret = file_update_time(file); if (ret) goto unlock; ret = bch2_buffered_write(iocb, from); if (likely(ret > 0)) iocb->ki_pos += ret; unlock: inode_unlock(&inode->v); if (ret > 0) ret = generic_write_sync(iocb, ret); out: return bch2_err_class(ret); } void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) { bioset_exit(&c->writepage_bioset); } int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { if (bioset_init(&c->writepage_bioset, 4, offsetof(struct bch_writepage_io, op.wbio.bio), BIOSET_NEED_BVECS)) return -BCH_ERR_ENOMEM_writepage_bioset_init; return 0; } #endif /* NO_BCACHEFS_FS */ |
| 1 27 3 27 27 27 22 5 19 50 31 19 19 16 3 2 14 5 38 38 38 7 3 4 1 1 22 243 242 23 22 38 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 | // SPDX-License-Identifier: GPL-2.0-only /* Event cache for netfilter. */ /* * (C) 2005 Harald Welte <laforge@gnumonks.org> * (C) 2005 Patrick McHardy <kaber@trash.net> * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/netfilter.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/stddef.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/export.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> static DEFINE_MUTEX(nf_ct_ecache_mutex); #define DYING_NULLS_VAL ((1 << 30) + 1) #define ECACHE_MAX_JIFFIES msecs_to_jiffies(10) #define ECACHE_RETRY_JIFFIES msecs_to_jiffies(10) enum retry_state { STATE_CONGESTED, STATE_RESTART, STATE_DONE, }; struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); return &cnet->ecache; } #if IS_MODULE(CONFIG_NF_CT_NETLINK) EXPORT_SYMBOL_GPL(nf_conn_pernet_ecache); #endif static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet) { unsigned long stop = jiffies + ECACHE_MAX_JIFFIES; struct hlist_nulls_head evicted_list; enum retry_state ret = STATE_DONE; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; unsigned int sent; INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL); next: sent = 0; spin_lock_bh(&cnet->ecache.dying_lock); hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); /* The worker owns all entries, ct remains valid until nf_ct_put * in the loop below. */ if (nf_conntrack_event(IPCT_DESTROY, ct)) { ret = STATE_CONGESTED; break; } hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &evicted_list); if (time_after(stop, jiffies)) { ret = STATE_RESTART; break; } if (sent++ > 16) { spin_unlock_bh(&cnet->ecache.dying_lock); cond_resched(); goto next; } } spin_unlock_bh(&cnet->ecache.dying_lock); hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); nf_ct_put(ct); cond_resched(); } return ret; } static void ecache_work(struct work_struct *work) { struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache.dwork.work); int ret, delay = -1; ret = ecache_work_evict_list(cnet); switch (ret) { case STATE_CONGESTED: delay = ECACHE_RETRY_JIFFIES; break; case STATE_RESTART: delay = 0; break; case STATE_DONE: break; } if (delay >= 0) schedule_delayed_work(&cnet->ecache.dwork, delay); } static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e, const u32 events, const u32 missed, const struct nf_ct_event *item) { struct net *net = nf_ct_net(item->ct); struct nf_ct_event_notifier *notify; u32 old, want; int ret; if (!((events | missed) & e->ctmask)) return 0; rcu_read_lock(); notify = rcu_dereference(net->ct.nf_conntrack_event_cb); if (!notify) { rcu_read_unlock(); return 0; } ret = notify->ct_event(events | missed, item); rcu_read_unlock(); if (likely(ret >= 0 && missed == 0)) return 0; do { old = READ_ONCE(e->missed); if (ret < 0) want = old | events; else want = old & ~missed; } while (cmpxchg(&e->missed, old, want) != old); return ret; } static void nf_ct_ecache_tstamp_refresh(struct nf_conntrack_ecache *e) { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP if (local64_read(&e->timestamp)) local64_set(&e->timestamp, ktime_get_real_ns()); #endif } int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct, u32 portid, int report) { struct nf_conntrack_ecache *e; struct nf_ct_event item; unsigned int missed; int ret; if (!nf_ct_is_confirmed(ct)) return 0; e = nf_ct_ecache_find(ct); if (!e) return 0; memset(&item, 0, sizeof(item)); item.ct = ct; item.portid = e->portid ? e->portid : portid; item.report = report; /* This is a resent of a destroy event? If so, skip missed */ missed = e->portid ? 0 : e->missed; nf_ct_ecache_tstamp_refresh(e); ret = __nf_conntrack_eventmask_report(e, events, missed, &item); if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) { /* This is a destroy event that has been triggered by a process, * we store the PORTID to include it in the retransmission. */ if (e->portid == 0 && portid != 0) e->portid = portid; } return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report); /* deliver cached events and clear cache entry - must be called with locally * disabled softirqs */ void nf_ct_deliver_cached_events(struct nf_conn *ct) { struct nf_conntrack_ecache *e; struct nf_ct_event item; unsigned int events; if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) return; e = nf_ct_ecache_find(ct); if (e == NULL) return; events = xchg(&e->cache, 0); item.ct = ct; item.portid = 0; item.report = 0; /* We make a copy of the missed event cache without taking * the lock, thus we may send missed events twice. However, * this does not harm and it happens very rarely. */ __nf_conntrack_eventmask_report(e, events, e->missed, &item); } EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, struct nf_conntrack_expect *exp, u32 portid, int report) { struct net *net = nf_ct_exp_net(exp); struct nf_ct_event_notifier *notify; struct nf_conntrack_ecache *e; rcu_read_lock(); notify = rcu_dereference(net->ct.nf_conntrack_event_cb); if (!notify) goto out_unlock; e = nf_ct_ecache_find(exp->master); if (!e) goto out_unlock; if (e->expmask & (1 << event)) { struct nf_exp_event item = { .exp = exp, .portid = portid, .report = report }; notify->exp_event(1 << event, &item); } out_unlock: rcu_read_unlock(); } void nf_conntrack_register_notifier(struct net *net, const struct nf_ct_event_notifier *new) { struct nf_ct_event_notifier *notify; mutex_lock(&nf_ct_ecache_mutex); notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, lockdep_is_held(&nf_ct_ecache_mutex)); WARN_ON_ONCE(notify); rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new); mutex_unlock(&nf_ct_ecache_mutex); } EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); void nf_conntrack_unregister_notifier(struct net *net) { mutex_lock(&nf_ct_ecache_mutex); RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL); mutex_unlock(&nf_ct_ecache_mutex); /* synchronize_rcu() is called after netns pre_exit */ } EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); if (state == NFCT_ECACHE_DESTROY_FAIL && !delayed_work_pending(&cnet->ecache.dwork)) { schedule_delayed_work(&cnet->ecache.dwork, HZ); net->ct.ecache_dwork_pending = true; } else if (state == NFCT_ECACHE_DESTROY_SENT) { if (!hlist_nulls_empty(&cnet->ecache.dying_list)) mod_delayed_work(system_wq, &cnet->ecache.dwork, 0); else net->ct.ecache_dwork_pending = false; } } static void nf_ct_ecache_tstamp_new(const struct nf_conn *ct, struct nf_conntrack_ecache *e) { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP u64 ts = 0; if (nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP)) ts = ktime_get_real_ns(); local64_set(&e->timestamp, ts); #endif } bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) { struct net *net = nf_ct_net(ct); struct nf_conntrack_ecache *e; switch (net->ct.sysctl_events) { case 0: /* assignment via template / ruleset? ignore sysctl. */ if (ctmask || expmask) break; return true; case 2: /* autodetect: no event listener, don't allocate extension. */ if (!READ_ONCE(nf_ctnetlink_has_listener)) return true; fallthrough; case 1: /* always allocate an extension. */ if (!ctmask && !expmask) { ctmask = ~0; expmask = ~0; } break; default: WARN_ON_ONCE(1); return true; } e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); if (e) { nf_ct_ecache_tstamp_new(ct, e); e->ctmask = ctmask; e->expmask = expmask; } return e != NULL; } EXPORT_SYMBOL_GPL(nf_ct_ecache_ext_add); #define NF_CT_EVENTS_DEFAULT 2 static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; void nf_conntrack_ecache_pernet_init(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); net->ct.sysctl_events = nf_ct_events; INIT_DELAYED_WORK(&cnet->ecache.dwork, ecache_work); INIT_HLIST_NULLS_HEAD(&cnet->ecache.dying_list, DYING_NULLS_VAL); spin_lock_init(&cnet->ecache.dying_lock); BUILD_BUG_ON(__IPCT_MAX >= 16); /* e->ctmask is u16 */ } void nf_conntrack_ecache_pernet_fini(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); cancel_delayed_work_sync(&cnet->ecache.dwork); } |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 | /* SPDX-License-Identifier: GPL-2.0 */ /* * NFS internal definitions */ #include "nfs4_fs.h" #include <linux/fs_context.h> #include <linux/security.h> #include <linux/compiler_attributes.h> #include <linux/crc32.h> #include <linux/sunrpc/addr.h> #include <linux/nfs_page.h> #include <linux/nfslocalio.h> #include <linux/wait_bit.h> #define NFS_SB_MASK (SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) extern const struct export_operations nfs_export_ops; struct nfs_string; struct nfs_pageio_descriptor; static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) { if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT; } static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr) { if (((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) == 0) || (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) && ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0))) return 0; return 1; } static inline bool nfs_lookup_is_soft_revalidate(const struct dentry *dentry) { if (!(NFS_SB(dentry->d_sb)->flags & NFS_MOUNT_SOFTREVAL)) return false; if (!d_is_positive(dentry) || !NFS_FH(d_inode(dentry))->size) return false; return true; } static inline fmode_t flags_to_mode(int flags) { fmode_t res = (__force fmode_t)flags & FMODE_EXEC; if ((flags & O_ACCMODE) != O_WRONLY) res |= FMODE_READ; if ((flags & O_ACCMODE) != O_RDONLY) res |= FMODE_WRITE; return res; } /* * Note: RFC 1813 doesn't limit the number of auth flavors that * a server can return, so make something up. */ #define NFS_MAX_SECFLAVORS (12) /* * Value used if the user did not specify a port value. */ #define NFS_UNSPEC_PORT (-1) #define NFS_UNSPEC_RETRANS (UINT_MAX) #define NFS_UNSPEC_TIMEO (UINT_MAX) struct nfs_client_initdata { unsigned long init_flags; const char *hostname; /* Hostname of the server */ const struct sockaddr_storage *addr; /* Address of the server */ const char *nodename; /* Hostname of the client */ const char *ip_addr; /* IP address of the client */ size_t addrlen; struct nfs_subversion *nfs_mod; int proto; u32 minorversion; unsigned int nconnect; unsigned int max_connect; struct net *net; const struct rpc_timeout *timeparms; const struct cred *cred; struct xprtsec_parms xprtsec; unsigned long connect_timeout; unsigned long reconnect_timeout; }; /* * In-kernel mount arguments */ struct nfs_fs_context { bool internal; bool skip_reconfig_option_check; bool need_mount; bool sloppy; unsigned int flags; /* NFS{,4}_MOUNT_* flags */ unsigned int rsize, wsize; unsigned int timeo, retrans; unsigned int acregmin, acregmax; unsigned int acdirmin, acdirmax; unsigned int namlen; unsigned int options; unsigned int bsize; struct nfs_auth_info auth_info; rpc_authflavor_t selected_flavor; struct xprtsec_parms xprtsec; char *client_address; unsigned int version; unsigned int minorversion; char *fscache_uniq; unsigned short protofamily; unsigned short mountfamily; bool has_sec_mnt_opts; int lock_status; struct { union { struct sockaddr address; struct sockaddr_storage _address; }; size_t addrlen; char *hostname; u32 version; int port; unsigned short protocol; } mount_server; struct { union { struct sockaddr address; struct sockaddr_storage _address; }; size_t addrlen; char *hostname; char *export_path; int port; unsigned short protocol; unsigned short nconnect; unsigned short max_connect; unsigned short export_path_len; } nfs_server; struct nfs_fh *mntfh; struct nfs_server *server; struct nfs_subversion *nfs_mod; /* Information for a cloned mount. */ struct nfs_clone_mount { struct super_block *sb; struct dentry *dentry; struct nfs_fattr *fattr; unsigned int inherited_bsize; } clone_data; }; enum nfs_lock_status { NFS_LOCK_NOT_SET = 0, NFS_LOCK_LOCK = 1, NFS_LOCK_NOLOCK = 2, }; #define nfs_errorf(fc, fmt, ...) ((fc)->log.log ? \ errorf(fc, fmt, ## __VA_ARGS__) : \ ({ dprintk(fmt "\n", ## __VA_ARGS__); })) #define nfs_ferrorf(fc, fac, fmt, ...) ((fc)->log.log ? \ errorf(fc, fmt, ## __VA_ARGS__) : \ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); })) #define nfs_invalf(fc, fmt, ...) ((fc)->log.log ? \ invalf(fc, fmt, ## __VA_ARGS__) : \ ({ dprintk(fmt "\n", ## __VA_ARGS__); -EINVAL; })) #define nfs_finvalf(fc, fac, fmt, ...) ((fc)->log.log ? \ invalf(fc, fmt, ## __VA_ARGS__) : \ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); -EINVAL; })) #define nfs_warnf(fc, fmt, ...) ((fc)->log.log ? \ warnf(fc, fmt, ## __VA_ARGS__) : \ ({ dprintk(fmt "\n", ## __VA_ARGS__); })) #define nfs_fwarnf(fc, fac, fmt, ...) ((fc)->log.log ? \ warnf(fc, fmt, ## __VA_ARGS__) : \ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); })) static inline struct nfs_fs_context *nfs_fc2context(const struct fs_context *fc) { return fc->fs_private; } /* mount_clnt.c */ struct nfs_mount_request { struct sockaddr_storage *sap; size_t salen; char *hostname; char *dirpath; u32 version; unsigned short protocol; struct nfs_fh *fh; int noresvport; unsigned int *auth_flav_len; rpc_authflavor_t *auth_flavs; struct net *net; }; extern int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans); extern void nfs_umount(const struct nfs_mount_request *info); /* client.c */ extern const struct rpc_program nfs_program; extern void nfs_clients_init(struct net *net); extern void nfs_clients_exit(struct net *net); extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t); struct nfs_client *nfs_get_client(const struct nfs_client_initdata *); int nfs_probe_server(struct nfs_server *, struct nfs_fh *); void nfs_server_insert_lists(struct nfs_server *); void nfs_server_remove_lists(struct nfs_server *); void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans); int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, rpc_authflavor_t); struct nfs_server *nfs_alloc_server(void); void nfs_server_copy_userdata(struct nfs_server *, struct nfs_server *); extern void nfs_put_client(struct nfs_client *); extern void nfs_free_client(struct nfs_client *); extern struct nfs_client *nfs4_find_client_ident(struct net *, int); extern struct nfs_client * nfs4_find_client_sessionid(struct net *, const struct sockaddr *, struct nfs4_sessionid *, u32); extern struct nfs_server *nfs_create_server(struct fs_context *); extern void nfs4_server_set_init_caps(struct nfs_server *); extern struct nfs_server *nfs4_create_server(struct fs_context *); extern struct nfs_server *nfs4_create_referral_server(struct fs_context *); extern int nfs4_update_server(struct nfs_server *server, const char *hostname, struct sockaddr_storage *sap, size_t salen, struct net *net); extern void nfs_free_server(struct nfs_server *server); extern struct nfs_server *nfs_clone_server(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, rpc_authflavor_t); extern bool nfs_client_init_is_complete(const struct nfs_client *clp); extern int nfs_client_init_status(const struct nfs_client *clp); extern int nfs_wait_client_init_complete(const struct nfs_client *clp); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, u32 minor_version); extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans); #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); extern int nfs_fs_proc_net_init(struct net *net); extern void nfs_fs_proc_net_exit(struct net *net); #else static inline int nfs_fs_proc_net_init(struct net *net) { return 0; } static inline void nfs_fs_proc_net_exit(struct net *net) { } static inline int nfs_fs_proc_init(void) { return 0; } static inline void nfs_fs_proc_exit(void) { } #endif /* callback_xdr.c */ extern const struct svc_version nfs4_callback_version1; extern const struct svc_version nfs4_callback_version4; /* fs_context.c */ extern struct file_system_type nfs_fs_type; /* pagelist.c */ extern int __init nfs_init_nfspagecache(void); extern void nfs_destroy_nfspagecache(void); extern int __init nfs_init_readpagecache(void); extern void nfs_destroy_readpagecache(void); extern int __init nfs_init_writepagecache(void); extern void nfs_destroy_writepagecache(void); extern int __init nfs_init_directcache(void); extern void nfs_destroy_directcache(void); extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); int nfs_iocounter_wait(struct nfs_lock_context *l_ctx); extern const struct nfs_pageio_ops nfs_pgio_rw_ops; struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); void nfs_pgio_header_free(struct nfs_pgio_header *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, const struct cred *cred, const struct nfs_rpc_ops *rpc_ops, const struct rpc_call_ops *call_ops, int how, int flags, struct nfsd_file *localio); void nfs_free_request(struct nfs_page *req); struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc); static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1, const struct nfs_open_context *ctx2) { return cred_fscmp(ctx1->cred, ctx2->cred) == 0 && ctx1->state == ctx2->state; } /* nfs2xdr.c */ extern const struct rpc_procinfo nfs_procedures[]; extern int nfs2_decode_dirent(struct xdr_stream *, struct nfs_entry *, bool); /* nfs3xdr.c */ extern const struct rpc_procinfo nfs3_procedures[]; extern int nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, bool); /* nfs4xdr.c */ #if IS_ENABLED(CONFIG_NFS_V4) extern int nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, bool); #endif #ifdef CONFIG_NFS_V4_1 extern const u32 nfs41_maxread_overhead; extern const u32 nfs41_maxwrite_overhead; extern const u32 nfs41_maxgetdevinfo_overhead; #endif /* nfs4proc.c */ #if IS_ENABLED(CONFIG_NFS_V4) extern const struct rpc_procinfo nfs4_procedures[]; #endif #ifdef CONFIG_NFS_V4_SECURITY_LABEL extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags); static inline struct nfs4_label * nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) { if (!dst || !src) return NULL; if (src->len > NFS4_MAXLABELLEN) return NULL; dst->lfs = src->lfs; dst->pi = src->pi; dst->len = src->len; memcpy(dst->label, src->label, src->len); return dst; } static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) { if (nfs_server_capable(&nfsi->vfs_inode, NFS_CAP_SECURITY_LABEL)) nfsi->cache_validity |= NFS_INO_INVALID_LABEL; } #else static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; } static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) { } static inline struct nfs4_label * nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) { return NULL; } #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); /* dir.c */ extern void nfs_readdir_record_entry_cache_hit(struct inode *dir); extern void nfs_readdir_record_entry_cache_miss(struct inode *dir); extern unsigned long nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc); extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc); struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); void nfs_d_prune_case_insensitive_aliases(struct inode *inode); int nfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); struct dentry *nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); int nfs_rmdir(struct inode *, struct dentry *); int nfs_unlink(struct inode *, struct dentry *); int nfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); int nfs_link(struct dentry *, struct inode *, struct dentry *); int nfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); int nfs_rename(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); #ifdef CONFIG_NFS_V4_2 static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server) { if (!(server->caps & NFS_CAP_XATTR)) return 0; return NFS4_ACCESS_XAREAD | NFS4_ACCESS_XAWRITE | NFS4_ACCESS_XALIST; } #else static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server) { return 0; } #endif /* file.c */ int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); loff_t nfs_file_llseek(struct file *, loff_t, int); ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); ssize_t nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); int nfs_file_mmap(struct file *, struct vm_area_struct *); ssize_t nfs_file_write(struct kiocb *, struct iov_iter *); int nfs_file_release(struct inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); int nfs_flock(struct file *, int, struct file_lock *); int nfs_check_flags(int); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; extern struct workqueue_struct *nfslocaliod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_free_inode(struct inode *); extern int nfs_write_inode(struct inode *, struct writeback_control *); extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); extern void nfs_zap_acl_cache(struct inode *inode); extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags); extern bool nfs_check_cache_invalid(struct inode *, unsigned long); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); #if IS_ENABLED(CONFIG_NFS_LOCALIO) /* localio.c */ extern void nfs_local_probe(struct nfs_client *); extern void nfs_local_probe_async(struct nfs_client *); extern void nfs_local_probe_async_work(struct work_struct *); extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *, const struct cred *, struct nfs_fh *, struct nfs_file_localio *, const fmode_t); extern int nfs_local_doio(struct nfs_client *, struct nfsd_file *, struct nfs_pgio_header *, const struct rpc_call_ops *); extern int nfs_local_commit(struct nfsd_file *, struct nfs_commit_data *, const struct rpc_call_ops *, int); extern bool nfs_server_is_local(const struct nfs_client *clp); #else /* CONFIG_NFS_LOCALIO */ static inline void nfs_local_probe(struct nfs_client *clp) {} static inline void nfs_local_probe_async(struct nfs_client *clp) {} static inline struct nfsd_file * nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, struct nfs_fh *fh, struct nfs_file_localio *nfl, const fmode_t mode) { return NULL; } static inline int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio, struct nfs_pgio_header *hdr, const struct rpc_call_ops *call_ops) { return -EINVAL; } static inline int nfs_local_commit(struct nfsd_file *localio, struct nfs_commit_data *data, const struct rpc_call_ops *call_ops, int how) { return -EINVAL; } static inline bool nfs_server_is_local(const struct nfs_client *clp) { return false; } #endif /* CONFIG_NFS_LOCALIO */ /* super.c */ extern const struct super_operations nfs_sops; bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t); int nfs_try_get_tree(struct fs_context *); int nfs_get_tree_common(struct fs_context *); void nfs_kill_super(struct super_block *); extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); extern bool nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); extern int nfs_client_for_each_server(struct nfs_client *clp, int (*fn)(struct nfs_server *, void *), void *data); #ifdef CONFIG_NFS_FSCACHE extern const struct netfs_request_ops nfs_netfs_ops; #endif /* io.c */ extern __must_check int nfs_start_io_read(struct inode *inode); extern void nfs_end_io_read(struct inode *inode); extern __must_check int nfs_start_io_write(struct inode *inode); extern void nfs_end_io_write(struct inode *inode); extern __must_check int nfs_start_io_direct(struct inode *inode); extern void nfs_end_io_direct(struct inode *inode); static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) { return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; } /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen, unsigned flags); extern struct vfsmount *nfs_d_automount(struct path *path); int nfs_submount(struct fs_context *, struct nfs_server *); int nfs_do_submount(struct fs_context *); /* getroot.c */ extern int nfs_get_root(struct super_block *s, struct fs_context *fc); #if IS_ENABLED(CONFIG_NFS_V4) extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool); #endif struct nfs_pgio_completion_ops; /* read.c */ extern const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); extern bool nfs_read_alloc_scratch(struct nfs_pgio_header *hdr, size_t size); extern int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio, struct nfs_open_context *ctx, struct folio *folio); extern void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); /* super.c */ void nfs_umount_begin(struct super_block *); int nfs_statfs(struct dentry *, struct kstatfs *); int nfs_show_options(struct seq_file *, struct dentry *); int nfs_show_devname(struct seq_file *, struct dentry *); int nfs_show_path(struct seq_file *, struct dentry *); int nfs_show_stats(struct seq_file *, struct dentry *); int nfs_reconfigure(struct fs_context *); /* write.c */ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_commit_free(struct nfs_commit_data *p); extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); extern int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, const struct nfs_rpc_ops *nfs_ops, const struct rpc_call_ops *call_ops, int how, int flags, struct nfsd_file *localio); extern void nfs_init_commit(struct nfs_commit_data *data, struct list_head *head, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo); int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max); unsigned long nfs_reqs_to_commit(struct nfs_commit_info *); int nfs_scan_commit(struct inode *inode, struct list_head *dst, struct nfs_commit_info *cinfo); void nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx); void nfs_commitdata_release(struct nfs_commit_data *data); void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo); void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, struct nfs_commit_info *cinfo); void nfs_request_remove_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo); void nfs_init_cinfo(struct nfs_commit_info *cinfo, struct inode *inode, struct nfs_direct_req *dreq); int nfs_key_timeout_notify(struct file *filp, struct inode *inode); bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode); void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); int nfs_filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); #ifdef CONFIG_NFS_V4_1 static inline void pnfs_bucket_clear_pnfs_ds_commit_verifiers(struct pnfs_commit_bucket *buckets, unsigned int nbuckets) { unsigned int i; for (i = 0; i < nbuckets; i++) buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; } static inline void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) { struct pnfs_commit_array *array; rcu_read_lock(); list_for_each_entry_rcu(array, &cinfo->commits, cinfo_list) pnfs_bucket_clear_pnfs_ds_commit_verifiers(array->buckets, array->nbuckets); rcu_read_unlock(); } #else static inline void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) { } #endif #ifdef CONFIG_MIGRATION int nfs_migrate_folio(struct address_space *, struct folio *dst, struct folio *src, enum migrate_mode); #else #define nfs_migrate_folio NULL #endif static inline int nfs_write_verifier_cmp(const struct nfs_write_verifier *v1, const struct nfs_write_verifier *v2) { return memcmp(v1->data, v2->data, sizeof(v1->data)); } static inline bool nfs_write_match_verf(const struct nfs_writeverf *verf, struct nfs_page *req) { return verf->committed > NFS_UNSTABLE && !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier); } static inline gfp_t nfs_io_gfp_mask(void) { if (current->flags & PF_WQ_WORKER) return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; return GFP_KERNEL; } /* * Special version of should_remove_suid() that ignores capabilities. */ static inline int nfs_should_remove_suid(const struct inode *inode) { umode_t mode = inode->i_mode; int kill = 0; /* suid always must be killed */ if (unlikely(mode & S_ISUID)) kill = ATTR_KILL_SUID; /* * sgid without any exec bits is just a mandatory locking mark; leave * it alone. If some exec bits are set, it's a real sgid; kill it. */ if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) kill |= ATTR_KILL_SGID; if (unlikely(kill && S_ISREG(mode))) return kill; return 0; } /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, struct dentry *old_dentry, struct dentry *new_dentry, void (*complete)(struct rpc_task *, struct nfs_renamedata *)); extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); /* direct.c */ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq); extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset); /* nfs4proc.c */ extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); extern int nfs40_walk_client_list(struct nfs_client *clp, struct nfs_client **result, const struct cred *cred); extern int nfs41_walk_client_list(struct nfs_client *clp, struct nfs_client **result, const struct cred *cred); extern void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data); static inline struct inode *nfs_igrab_and_active(struct inode *inode) { struct super_block *sb = inode->i_sb; if (sb && nfs_sb_active(sb)) { if (igrab(inode)) return inode; nfs_sb_deactive(sb); } return NULL; } static inline void nfs_iput_and_deactive(struct inode *inode) { if (inode != NULL) { struct super_block *sb = inode->i_sb; iput(inode); nfs_sb_deactive(sb); } } /* * Determine the device name as a string */ static inline char *nfs_devname(struct dentry *dentry, char *buffer, ssize_t buflen) { char *dummy; return nfs_path(&dummy, dentry, buffer, buflen, NFS_PATH_CANONICAL); } /* * Determine the actual block size (and log2 thereof) */ static inline unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp) { /* make sure blocksize is a power of two */ if ((bsize & (bsize - 1)) || nrbitsp) { unsigned char nrbits; for (nrbits = 31; nrbits && !(bsize & (1UL << nrbits)); nrbits--) ; bsize = 1UL << nrbits; if (nrbitsp) *nrbitsp = nrbits; } return bsize; } /* * Calculate the number of 512byte blocks used. */ static inline blkcnt_t nfs_calc_block_size(u64 tsize) { blkcnt_t used = (tsize + 511) >> 9; return (used > ULONG_MAX) ? ULONG_MAX : used; } /* * Compute and set NFS server blocksize */ static inline unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp) { if (bsize < NFS_MIN_FILE_IO_SIZE) bsize = NFS_DEF_FILE_IO_SIZE; else if (bsize >= NFS_MAX_FILE_IO_SIZE) bsize = NFS_MAX_FILE_IO_SIZE; return nfs_block_bits(bsize, nrbitsp); } /* * Compute and set NFS server rsize / wsize */ static inline unsigned long nfs_io_size(unsigned long iosize, enum xprt_transports proto) { if (iosize < NFS_MIN_FILE_IO_SIZE) iosize = NFS_DEF_FILE_IO_SIZE; else if (iosize >= NFS_MAX_FILE_IO_SIZE) iosize = NFS_MAX_FILE_IO_SIZE; if (proto == XPRT_TRANSPORT_UDP || iosize < PAGE_SIZE) return nfs_block_bits(iosize, NULL); return iosize & PAGE_MASK; } /* * Determine the maximum file size for a superblock */ static inline void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) { sb->s_maxbytes = (loff_t)maxfilesize; if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0) sb->s_maxbytes = MAX_LFS_FILESIZE; } /* * Record the page as unstable (an extra writeback period) and mark its * inode as dirty. */ static inline void nfs_folio_mark_unstable(struct folio *folio, struct nfs_commit_info *cinfo) { if (folio && !cinfo->dreq) { struct inode *inode = folio->mapping->host; long nr = folio_nr_pages(folio); /* This page is really still in write-back - just that the * writeback is happening on the server now. */ node_stat_mod_folio(folio, NR_WRITEBACK, nr); wb_stat_mod(&inode_to_bdi(inode)->wb, WB_WRITEBACK, nr); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } } /* * Determine the number of bytes of data the page contains */ static inline size_t nfs_folio_length(struct folio *folio) { loff_t i_size = i_size_read(folio->mapping->host); if (i_size > 0) { pgoff_t index = folio->index >> folio_order(folio); pgoff_t end_index = (i_size - 1) >> folio_shift(folio); if (index < end_index) return folio_size(folio); if (index == end_index) return offset_in_folio(folio, i_size - 1) + 1; } return 0; } /* * Convert a umode to a dirent->d_type */ static inline unsigned char nfs_umode_to_dtype(umode_t mode) { return (mode >> 12) & 15; } /* * Determine the number of pages in an array of length 'len' and * with a base offset of 'base' */ static inline unsigned int nfs_page_array_len(unsigned int base, size_t len) { return ((unsigned long)len + (unsigned long)base + PAGE_SIZE - 1) >> PAGE_SHIFT; } /* * Convert a struct timespec64 into a 64-bit change attribute * * This does approximately the same thing as timespec64_to_ns(), * but for calculation efficiency, we multiply the seconds by * 1024*1024*1024. */ static inline u64 nfs_timespec_to_change_attr(const struct timespec64 *ts) { return ((u64)ts->tv_sec << 30) + ts->tv_nsec; } #ifdef CONFIG_CRC32 static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid) { return ~crc32_le(0xFFFFFFFF, &stateid->other[0], NFS4_STATEID_OTHER_SIZE); } #else static inline u32 nfs_stateid_hash(nfs4_stateid *stateid) { return 0; } #endif static inline bool nfs_current_task_exiting(void) { return (current->flags & PF_EXITING) != 0; } static inline bool nfs_error_is_fatal(int err) { switch (err) { case -ERESTARTSYS: case -EINTR: case -EACCES: case -EDQUOT: case -EFBIG: case -EIO: case -ENOSPC: case -EROFS: case -ESTALE: case -E2BIG: case -ENOMEM: case -ETIMEDOUT: return true; default: return false; } } static inline bool nfs_error_is_fatal_on_server(int err) { switch (err) { case 0: case -ERESTARTSYS: case -EINTR: case -ENOMEM: return false; } return nfs_error_is_fatal(err); } /* * Select between a default port value and a user-specified port value. * If a zero value is set, then autobind will be used. */ static inline void nfs_set_port(struct sockaddr_storage *sap, int *port, const unsigned short default_port) { if (*port == NFS_UNSPEC_PORT) *port = default_port; rpc_set_port((struct sockaddr *)sap, *port); } struct nfs_direct_req { struct kref kref; /* release manager */ /* I/O parameters */ struct nfs_open_context *ctx; /* file open context info */ struct nfs_lock_context *l_ctx; /* Lock context info */ struct kiocb * iocb; /* controlling i/o request */ struct inode * inode; /* target file of i/o */ /* completion state */ atomic_t io_count; /* i/os we're waiting for */ spinlock_t lock; /* protect completion state */ loff_t io_start; /* Start offset for I/O */ ssize_t count, /* bytes actually processed */ max_count, /* max expected count */ error; /* any reported error */ struct completion completion; /* wait for i/o completion */ /* commit state */ struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */ struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */ struct work_struct work; int flags; /* for write */ #define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ #define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ /* for read */ #define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */ #define NFS_ODIRECT_DONE INT_MAX /* write verification failed */ }; |
| 427 425 238 269 165 351 83 397 39 411 132 131 1 131 131 126 126 1 6 104 101 3 3 271 19 241 241 241 235 7 50 15 36 257 257 270 1 270 222 1 224 223 224 131 3 130 129 60 1 24 38 59 2 37 1 52 23 45 21 26 59 59 4 11 178 82 115 5 1 110 2 4 177 178 178 177 4 11 26 25 25 15 15 15 11 11 1 1 2 9 16 16 16 16 13 1 16 16 1 15 16 9 3 1 1 1 4 3 11 11 38 7 20 12 37 37 1 35 1 35 1 1 32 26 6 4 3 6 2 33 31 22 9 16 15 20 17 37 37 67 10 66 10 67 6 69 8 20 58 12 66 60 1 15 71 68 10 68 10 71 2 70 61 15 2 68 6 65 2 70 80 79 1 78 2 79 1 79 1 79 1 88 88 2 1 73 19 15 1 14 12 9 8 9 3 1 15 1 14 14 14 1 11 1 12 12 12 12 4 12 12 2 11 14 14 4 4 1 11 11 8 2 92 33 1 19 1 19 4 7 7 4 1 1 1 1 1 1 1 1 255 245 11 109 170 12 5 100 99 99 50 28 23 50 28 23 50 6 6 14 14 5 9 9 1 497 2 1 1 1 1 12 386 498 4 2 2 502 2 502 502 2 109 367 20 388 2 1 386 93 6 122 123 123 110 1 99 103 268 268 527 524 450 4 452 22 14 2 8 2 2 11 1 536 534 84 1 83 1 83 390 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 | // SPDX-License-Identifier: GPL-2.0 #ifndef NO_BCACHEFS_FS #include "bcachefs.h" #include "acl.h" #include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" #include "chardev.h" #include "dirent.h" #include "errcode.h" #include "extents.h" #include "fs.h" #include "fs-io.h" #include "fs-ioctl.h" #include "fs-io-buffered.h" #include "fs-io-direct.h" #include "fs-io-pagecache.h" #include "fsck.h" #include "inode.h" #include "io_read.h" #include "journal.h" #include "keylist.h" #include "namei.h" #include "quota.h" #include "rebalance.h" #include "snapshot.h" #include "super.h" #include "xattr.h" #include "trace.h" #include <linux/aio.h> #include <linux/backing-dev.h> #include <linux/exportfs.h> #include <linux/fiemap.h> #include <linux/fs_context.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/posix_acl.h> #include <linux/random.h> #include <linux/seq_file.h> #include <linux/siphash.h> #include <linux/statfs.h> #include <linux/string.h> #include <linux/xattr.h> static struct kmem_cache *bch2_inode_cache; static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, struct bch_inode_info *, struct bch_inode_unpacked *, struct bch_subvolume *); void bch2_inode_update_after_write(struct btree_trans *trans, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, unsigned fields) { struct bch_fs *c = trans->c; BUG_ON(bi->bi_inum != inode->v.i_ino); bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum)); set_nlink(&inode->v, bch2_inode_nlink_get(bi)); i_uid_write(&inode->v, bi->bi_uid); i_gid_write(&inode->v, bi->bi_gid); inode->v.i_mode = bi->bi_mode; if (fields & ATTR_SIZE) i_size_write(&inode->v, bi->bi_size); if (fields & ATTR_ATIME) inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime)); if (fields & ATTR_MTIME) inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime)); if (fields & ATTR_CTIME) inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime)); inode->ei_inode = *bi; bch2_inode_flags_to_vfs(inode); } int __must_check bch2_write_inode(struct bch_fs *c, struct bch_inode_info *inode, inode_set_fn set, void *p, unsigned fields) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = {}; struct bch_inode_unpacked inode_u; int ret; retry: bch2_trans_begin(trans); ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); if (ret) goto err; struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u); ret = (set ? set(trans, inode, &inode_u, p) : 0); if (ret) goto err; struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); if (memcmp(&old_r, &new_r, sizeof(new_r))) { ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum); if (ret) goto err; } ret = bch2_inode_write(trans, &iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); /* * the btree node lock protects inode->ei_inode, not ei_update_lock; * this is important for inode updates via bchfs_write_index_update */ if (!ret) bch2_inode_update_after_write(trans, inode, &inode_u, fields); err: bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, "%s: inode %llu:%llu not found when updating", bch2_err_str(ret), inode_inum(inode).subvol, inode_inum(inode).inum); bch2_trans_put(trans); return ret < 0 ? ret : 0; } int bch2_fs_quota_transfer(struct bch_fs *c, struct bch_inode_info *inode, struct bch_qid new_qid, unsigned qtypes, enum quota_acct_mode mode) { unsigned i; int ret; qtypes &= enabled_qtypes(c); for (i = 0; i < QTYP_NR; i++) if (new_qid.q[i] == inode->ei_qid.q[i]) qtypes &= ~(1U << i); if (!qtypes) return 0; mutex_lock(&inode->ei_quota_lock); ret = bch2_quota_transfer(c, qtypes, new_qid, inode->ei_qid, inode->v.i_blocks + inode->ei_quota_reserved, mode); if (!ret) for (i = 0; i < QTYP_NR; i++) if (qtypes & (1 << i)) inode->ei_qid.q[i] = new_qid.q[i]; mutex_unlock(&inode->ei_quota_lock); return ret; } static bool subvol_inum_eq(subvol_inum a, subvol_inum b) { return a.subvol == b.subvol && a.inum == b.inum; } static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) { const subvol_inum *inum = data; siphash_key_t k = { .key[0] = seed }; return siphash_2u64(inum->subvol, inum->inum, &k); } static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) { const struct bch_inode_info *inode = data; return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed); } static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj) { const struct bch_inode_info *inode = obj; const subvol_inum *v = arg->key; return !subvol_inum_eq(inode->ei_inum, *v); } static const struct rhashtable_params bch2_vfs_inodes_params = { .head_offset = offsetof(struct bch_inode_info, hash), .key_offset = offsetof(struct bch_inode_info, ei_inum), .key_len = sizeof(subvol_inum), .hashfn = bch2_vfs_inode_hash_fn, .obj_hashfn = bch2_vfs_inode_obj_hash_fn, .obj_cmpfn = bch2_vfs_inode_cmp_fn, .automatic_shrinking = true, }; static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = { .head_offset = offsetof(struct bch_inode_info, by_inum_hash), .key_offset = offsetof(struct bch_inode_info, ei_inum.inum), .key_len = sizeof(u64), .automatic_shrinking = true, }; int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { struct bch_fs *c = trans->c; struct rhltable *ht = &c->vfs_inodes_by_inum_table; u64 inum = p.offset; DARRAY(u32) subvols; int ret = 0; if (!test_bit(BCH_FS_started, &c->flags)) return false; darray_init(&subvols); restart_from_top: /* * Tweaked version of __rhashtable_lookup(); we need to get a list of * subvolumes in which the given inode number is open. * * For this to work, we don't include the subvolume ID in the key that * we hash - all inodes with the same inode number regardless of * subvolume will hash to the same slot. * * This will be less than ideal if the same file is ever open * simultaneously in many different snapshots: */ rcu_read_lock(); struct rhash_lock_head __rcu *const *bkt; struct rhash_head *he; unsigned int hash; struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht); restart: hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params); bkt = rht_bucket(tbl, hash); do { struct bch_inode_info *inode; rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { if (inode->ei_inum.inum == inum) { ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, GFP_NOWAIT|__GFP_NOWARN); if (ret) { rcu_read_unlock(); ret = darray_make_room(&subvols, 1); if (ret) goto err; subvols.nr = 0; goto restart_from_top; } } } /* An object might have been moved to a different hash chain, * while we walk along it - better check and retry. */ } while (he != RHT_NULLS_MARKER(bkt)); /* Ensure we see any new tables. */ smp_rmb(); tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht); if (unlikely(tbl)) goto restart; rcu_read_unlock(); darray_for_each(subvols, i) { u32 snap; ret = bch2_subvolume_get_snapshot(trans, *i, &snap); if (ret) goto err; ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot); if (ret) break; } err: darray_exit(&subvols); return ret; } static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) { return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); } static void __wait_on_freeing_inode(struct bch_fs *c, struct bch_inode_info *inode, subvol_inum inum) { wait_queue_head_t *wq; struct wait_bit_queue_entry wait; wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->v.i_lock); if (__bch2_inode_hash_find(c, inum) == inode) schedule_timeout(HZ * 10); finish_wait(wq, &wait.wq_entry); } static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans, subvol_inum inum) { struct bch_inode_info *inode; repeat: inode = __bch2_inode_hash_find(c, inum); if (inode) { spin_lock(&inode->v.i_lock); if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) { spin_unlock(&inode->v.i_lock); return NULL; } if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) { if (!trans) { __wait_on_freeing_inode(c, inode, inum); } else { bch2_trans_unlock(trans); __wait_on_freeing_inode(c, inode, inum); int ret = bch2_trans_relock(trans); if (ret) return ERR_PTR(ret); } goto repeat; } __iget(&inode->v); spin_unlock(&inode->v.i_lock); } return inode; } static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode) { spin_lock(&inode->v.i_lock); bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags); spin_unlock(&inode->v.i_lock); if (remove) { int ret = rhltable_remove(&c->vfs_inodes_by_inum_table, &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params); BUG_ON(ret); ret = rhashtable_remove_fast(&c->vfs_inodes_table, &inode->hash, bch2_vfs_inodes_params); BUG_ON(ret); inode->v.i_hash.pprev = NULL; /* * This pairs with the bch2_inode_hash_find() -> * __wait_on_freeing_inode() path */ inode_wake_up_bit(&inode->v, __I_NEW); } } static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, struct btree_trans *trans, struct bch_inode_info *inode) { struct bch_inode_info *old = inode; set_bit(EI_INODE_HASHED, &inode->ei_flags); retry: if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table, &inode->ei_inum, &inode->hash, bch2_vfs_inodes_params))) { old = bch2_inode_hash_find(c, trans, inode->ei_inum); if (!old) goto retry; clear_bit(EI_INODE_HASHED, &inode->ei_flags); /* * bcachefs doesn't use I_NEW; we have no use for it since we * only insert fully created inodes in the inode hash table. But * discard_new_inode() expects it to be set... */ inode->v.i_state |= I_NEW; /* * We don't want bch2_evict_inode() to delete the inode on disk, * we just raced and had another inode in cache. Normally new * inodes don't have nlink == 0 - except tmpfiles do... */ set_nlink(&inode->v, 1); discard_new_inode(&inode->v); return old; } else { int ret = rhltable_insert(&c->vfs_inodes_by_inum_table, &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params); BUG_ON(ret); inode_fake_hash(&inode->v); inode_sb_list_add(&inode->v); mutex_lock(&c->vfs_inodes_lock); list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); mutex_unlock(&c->vfs_inodes_lock); return inode; } } #define memalloc_flags_do(_flags, _do) \ ({ \ unsigned _saved_flags = memalloc_flags_save(_flags); \ typeof(_do) _ret = _do; \ memalloc_noreclaim_restore(_saved_flags); \ _ret; \ }) static struct inode *bch2_alloc_inode(struct super_block *sb) { BUG(); } static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp) { struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb, bch2_inode_cache, gfp); if (!inode) return NULL; inode_init_once(&inode->v); mutex_init(&inode->ei_update_lock); two_state_lock_init(&inode->ei_pagecache_lock); INIT_LIST_HEAD(&inode->ei_vfs_inode_list); inode->ei_flags = 0; mutex_init(&inode->ei_quota_lock); memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) { kmem_cache_free(bch2_inode_cache, inode); return NULL; } return inode; } /* * Allocate a new inode, dropping/retaking btree locks if necessary: */ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) { struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT); if (unlikely(!inode)) { int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM); if (ret && inode) { __destroy_inode(&inode->v); kmem_cache_free(bch2_inode_cache, inode); } if (ret) return ERR_PTR(ret); } return inode; } static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *bi, struct bch_subvolume *subvol) { struct bch_inode_info *inode = bch2_new_inode(trans); if (IS_ERR(inode)) return inode; bch2_vfs_inode_init(trans, inum, inode, bi, subvol); return bch2_inode_hash_insert(trans->c, trans, inode); } struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum); if (inode) return &inode->v; struct btree_trans *trans = bch2_trans_get(c); struct bch_inode_unpacked inode_u; struct bch_subvolume subvol; int ret = lockrestart_do(trans, bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); bch2_trans_put(trans); return ret ? ERR_PTR(ret) : &inode->v; } struct bch_inode_info * __bch2_create(struct mnt_idmap *idmap, struct bch_inode_info *dir, struct dentry *dentry, umode_t mode, dev_t rdev, subvol_inum snapshot_src, unsigned flags) { struct bch_fs *c = dir->v.i_sb->s_fs_info; struct btree_trans *trans; struct bch_inode_unpacked dir_u; struct bch_inode_info *inode; struct bch_inode_unpacked inode_u; struct posix_acl *default_acl = NULL, *acl = NULL; subvol_inum inum; struct bch_subvolume subvol; u64 journal_seq = 0; kuid_t kuid; kgid_t kgid; int ret; /* * preallocate acls + vfs inode before btree transaction, so that * nothing can fail after the transaction succeeds: */ #ifdef CONFIG_BCACHEFS_POSIX_ACL ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); if (ret) return ERR_PTR(ret); #endif inode = __bch2_new_inode(c, GFP_NOFS); if (unlikely(!inode)) { inode = ERR_PTR(-ENOMEM); goto err; } bch2_inode_init_early(c, &inode_u); if (!(flags & BCH_CREATE_TMPFILE)) mutex_lock(&dir->ei_update_lock); trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); kuid = mapped_fsuid(idmap, i_user_ns(&dir->v)); kgid = mapped_fsgid(idmap, i_user_ns(&dir->v)); ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?: bch2_create_trans(trans, inode_inum(dir), &dir_u, &inode_u, !(flags & BCH_CREATE_TMPFILE) ? &dentry->d_name : NULL, from_kuid(i_user_ns(&dir->v), kuid), from_kgid(i_user_ns(&dir->v), kgid), mode, rdev, default_acl, acl, snapshot_src, flags) ?: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, KEY_TYPE_QUOTA_PREALLOC); if (unlikely(ret)) goto err_before_quota; inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; inum.inum = inode_u.bi_inum; ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_trans_commit(trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN); err_before_quota: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; goto err_trans; } if (!(flags & BCH_CREATE_TMPFILE)) { bch2_inode_update_after_write(trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); mutex_unlock(&dir->ei_update_lock); } bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); /* * we must insert the new inode into the inode cache before calling * bch2_trans_exit() and dropping locks, else we could race with another * thread pulling the inode in and modifying it: * * also, calling bch2_inode_hash_insert() without passing in the * transaction object is sketchy - if we could ever end up in * __wait_on_freeing_inode(), we'd risk deadlock. * * But that shouldn't be possible, since we still have the inode locked * that we just created, and we _really_ can't take a transaction * restart here. */ inode = bch2_inode_hash_insert(c, NULL, inode); bch2_trans_put(trans); err: posix_acl_release(default_acl); posix_acl_release(acl); return inode; err_trans: if (!(flags & BCH_CREATE_TMPFILE)) mutex_unlock(&dir->ei_update_lock); bch2_trans_put(trans); make_bad_inode(&inode->v); iput(&inode->v); inode = ERR_PTR(ret); goto err; } /* methods */ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, subvol_inum dir, struct bch_hash_info *dir_hash_info, const struct qstr *name) { struct bch_fs *c = trans->c; struct btree_iter dirent_iter = {}; subvol_inum inum = {}; struct printbuf buf = PRINTBUF; struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, dir_hash_info, dir, name, 0); int ret = bkey_err(k); if (ret) return ERR_PTR(ret); struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ret = bch2_dirent_read_target(trans, dir, d, &inum); if (ret > 0) ret = -ENOENT; if (ret) goto err; struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum); if (inode) goto out; /* * Note: if check/repair needs it, we commit before * bch2_inode_hash_init_insert(), as after that point we can't take a * restart - not in the top level loop with a commit_do(), like we * usually do: */ struct bch_subvolume subvol; struct bch_inode_unpacked inode_u; ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); /* * don't remove it: check_inodes might find another inode that points * back to this dirent */ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "dirent to missing inode:\n%s", (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)); if (ret) goto err; out: bch2_trans_iter_exit(trans, &dirent_iter); printbuf_exit(&buf); return inode; err: inode = ERR_PTR(ret); goto out; } static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, unsigned int flags) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); struct bch_inode_info *inode; bch2_trans_do(c, PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), &hash, &dentry->d_name))); if (IS_ERR(inode)) inode = NULL; #ifdef CONFIG_UNICODE if (!inode && IS_CASEFOLDED(vdir)) { /* * Do not cache a negative dentry in casefolded directories * as it would need to be invalidated in the following situation: * - Lookup file "blAH" in a casefolded directory * - Creation of file "BLAH" in a casefolded directory * - Lookup file "blAH" in a casefolded directory * which would fail if we had a negative dentry. * * We should come back to this when VFS has a method to handle * this edgecase. */ return NULL; } #endif return d_splice_alias(&inode->v, dentry); } static int bch2_mknod(struct mnt_idmap *idmap, struct inode *vdir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct bch_inode_info *inode = __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, (subvol_inum) { 0 }, 0); if (IS_ERR(inode)) return bch2_err_class(PTR_ERR(inode)); d_instantiate(dentry, &inode->v); return 0; } static int bch2_create(struct mnt_idmap *idmap, struct inode *vdir, struct dentry *dentry, umode_t mode, bool excl) { return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0); } static int __bch2_link(struct bch_fs *c, struct bch_inode_info *inode, struct bch_inode_info *dir, struct dentry *dentry) { struct bch_inode_unpacked dir_u, inode_u; int ret; mutex_lock(&inode->ei_update_lock); struct btree_trans *trans = bch2_trans_get(c); ret = commit_do(trans, NULL, NULL, 0, bch2_link_trans(trans, inode_inum(dir), &dir_u, inode_inum(inode), &inode_u, &dentry->d_name)); if (likely(!ret)) { bch2_inode_update_after_write(trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); } bch2_trans_put(trans); mutex_unlock(&inode->ei_update_lock); return ret; } static int bch2_link(struct dentry *old_dentry, struct inode *vdir, struct dentry *dentry) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); int ret; lockdep_assert_held(&inode->v.i_rwsem); ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) return bch2_err_class(ret); ihold(&inode->v); d_instantiate(dentry, &inode->v); return 0; } int __bch2_unlink(struct inode *vdir, struct dentry *dentry, bool deleting_snapshot) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_inode_unpacked dir_u, inode_u; int ret; bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); struct btree_trans *trans = bch2_trans_get(c); ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_unlink_trans(trans, inode_inum(dir), &dir_u, &inode_u, &dentry->d_name, deleting_snapshot)); if (unlikely(ret)) goto err; bch2_inode_update_after_write(trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_MTIME); if (inode_u.bi_subvol) { /* * Subvolume deletion is asynchronous, but we still want to tell * the VFS that it's been deleted here: */ set_nlink(&inode->v, 0); } err: bch2_trans_put(trans); bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); return ret; } static int bch2_unlink(struct inode *vdir, struct dentry *dentry) { struct bch_inode_info *dir= to_bch_ei(vdir); struct bch_fs *c = dir->v.i_sb->s_fs_info; int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: __bch2_unlink(vdir, dentry, false); return bch2_err_class(ret); } static int bch2_symlink(struct mnt_idmap *idmap, struct inode *vdir, struct dentry *dentry, const char *symname) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir), *inode; int ret; inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (IS_ERR(inode)) return bch2_err_class(PTR_ERR(inode)); inode_lock(&inode->v); ret = page_symlink(&inode->v, symname, strlen(symname) + 1); inode_unlock(&inode->v); if (unlikely(ret)) goto err; ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); if (unlikely(ret)) goto err; ret = __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) goto err; d_instantiate(dentry, &inode->v); return 0; err: iput(&inode->v); return bch2_err_class(ret); } static struct dentry *bch2_mkdir(struct mnt_idmap *idmap, struct inode *vdir, struct dentry *dentry, umode_t mode) { return ERR_PTR(bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0)); } static int bch2_rename2(struct mnt_idmap *idmap, struct inode *src_vdir, struct dentry *src_dentry, struct inode *dst_vdir, struct dentry *dst_dentry, unsigned flags) { struct bch_fs *c = src_vdir->i_sb->s_fs_info; struct bch_inode_info *src_dir = to_bch_ei(src_vdir); struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); struct bch_inode_unpacked dst_dir_u, src_dir_u; struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u; struct btree_trans *trans; enum bch_rename_mode mode = flags & RENAME_EXCHANGE ? BCH_RENAME_EXCHANGE : dst_dentry->d_inode ? BCH_RENAME_OVERWRITE : BCH_RENAME; bool whiteout = !!(flags & RENAME_WHITEOUT); int ret; if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT)) return -EINVAL; if (mode == BCH_RENAME_OVERWRITE) { ret = filemap_write_and_wait_range(src_inode->v.i_mapping, 0, LLONG_MAX); if (ret) return ret; } bch2_lock_inodes(INODE_UPDATE_LOCK, src_dir, dst_dir, src_inode, dst_inode); trans = bch2_trans_get(c); ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?: bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol); if (ret) goto err_tx_restart; if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { ret = bch2_fs_quota_transfer(c, src_inode, dst_dir->ei_qid, 1 << QTYP_PRJ, KEY_TYPE_QUOTA_PREALLOC); if (ret) goto err; } if (mode == BCH_RENAME_EXCHANGE && inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { ret = bch2_fs_quota_transfer(c, dst_inode, src_dir->ei_qid, 1 << QTYP_PRJ, KEY_TYPE_QUOTA_PREALLOC); if (ret) goto err; } retry: bch2_trans_begin(trans); ret = bch2_rename_trans(trans, inode_inum(src_dir), &src_dir_u, inode_inum(dst_dir), &dst_dir_u, &src_inode_u, &dst_inode_u, &src_dentry->d_name, &dst_dentry->d_name, mode); if (unlikely(ret)) goto err_tx_restart; if (whiteout) { whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u)); ret = PTR_ERR_OR_ZERO(whiteout_inode_u); if (unlikely(ret)) goto err_tx_restart; bch2_inode_init_early(c, whiteout_inode_u); ret = bch2_create_trans(trans, inode_inum(src_dir), &src_dir_u, whiteout_inode_u, &src_dentry->d_name, from_kuid(i_user_ns(&src_dir->v), current_fsuid()), from_kgid(i_user_ns(&src_dir->v), current_fsgid()), S_IFCHR|WHITEOUT_MODE, 0, NULL, NULL, (subvol_inum) { 0 }, 0) ?: bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1, KEY_TYPE_QUOTA_PREALLOC); if (unlikely(ret)) goto err_tx_restart; } ret = bch2_trans_commit(trans, NULL, NULL, 0); if (unlikely(ret)) { err_tx_restart: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; goto err; } BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); BUG_ON(dst_inode && dst_inode->v.i_ino != dst_inode_u.bi_inum); bch2_inode_update_after_write(trans, src_dir, &src_dir_u, ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); if (src_dir != dst_dir) bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u, ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); bch2_inode_update_after_write(trans, src_inode, &src_inode_u, ATTR_CTIME); if (dst_inode) bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u, ATTR_CTIME); err: bch2_trans_put(trans); bch2_fs_quota_transfer(c, src_inode, bch_qid(&src_inode->ei_inode), 1 << QTYP_PRJ, KEY_TYPE_QUOTA_NOCHECK); if (dst_inode) bch2_fs_quota_transfer(c, dst_inode, bch_qid(&dst_inode->ei_inode), 1 << QTYP_PRJ, KEY_TYPE_QUOTA_NOCHECK); bch2_unlock_inodes(INODE_UPDATE_LOCK, src_dir, dst_dir, src_inode, dst_inode); return bch2_err_class(ret); } static void bch2_setattr_copy(struct mnt_idmap *idmap, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, struct iattr *attr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; unsigned int ia_valid = attr->ia_valid; kuid_t kuid; kgid_t kgid; if (ia_valid & ATTR_UID) { kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid); } if (ia_valid & ATTR_GID) { kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid); } if (ia_valid & ATTR_SIZE) bi->bi_size = attr->ia_size; if (ia_valid & ATTR_ATIME) bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); if (ia_valid & ATTR_MTIME) bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); if (ia_valid & ATTR_CTIME) bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; kgid_t gid = ia_valid & ATTR_GID ? kgid : inode->v.i_gid; if (!in_group_or_capable(idmap, &inode->v, make_vfsgid(idmap, i_user_ns(&inode->v), gid))) mode &= ~S_ISGID; bi->bi_mode = mode; } } int bch2_setattr_nonsize(struct mnt_idmap *idmap, struct bch_inode_info *inode, struct iattr *attr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_qid qid; struct btree_trans *trans; struct btree_iter inode_iter = {}; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; kuid_t kuid; kgid_t kgid; int ret; mutex_lock(&inode->ei_update_lock); qid = inode->ei_qid; if (attr->ia_valid & ATTR_UID) { kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid); } if (attr->ia_valid & ATTR_GID) { kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid); } ret = bch2_fs_quota_transfer(c, inode, qid, ~0, KEY_TYPE_QUOTA_PREALLOC); if (ret) goto err; trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); kfree(acl); acl = NULL; ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); if (ret) goto btree_err; bch2_setattr_copy(idmap, inode, &inode_u, attr); if (attr->ia_valid & ATTR_MODE) { ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u, inode_u.bi_mode, &acl); if (ret) goto btree_err; } ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); btree_err: bch2_trans_iter_exit(trans, &inode_iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (unlikely(ret)) goto err_trans; bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid); if (acl) set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); err_trans: bch2_trans_put(trans); err: mutex_unlock(&inode->ei_update_lock); return bch2_err_class(ret); } static int bch2_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned query_flags) { struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); struct bch_fs *c = inode->v.i_sb->s_fs_info; vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v); vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v); stat->dev = inode->v.i_sb->s_dev; stat->ino = inode->v.i_ino; stat->mode = inode->v.i_mode; stat->nlink = inode->v.i_nlink; stat->uid = vfsuid_into_kuid(vfsuid); stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->v.i_rdev; stat->size = i_size_read(&inode->v); stat->atime = inode_get_atime(&inode->v); stat->mtime = inode_get_mtime(&inode->v); stat->ctime = inode_get_ctime(&inode->v); stat->blksize = block_bytes(c); stat->blocks = inode->v.i_blocks; stat->subvol = inode->ei_inum.subvol; stat->result_mask |= STATX_SUBVOL; if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) { stat->result_mask |= STATX_DIOALIGN; /* * this is incorrect; we should be tracking this in superblock, * and checking the alignment of open devices */ stat->dio_mem_align = SECTOR_SIZE; stat->dio_offset_align = block_bytes(c); } if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); } if (inode->ei_inode.bi_flags & BCH_INODE_immutable) stat->attributes |= STATX_ATTR_IMMUTABLE; stat->attributes_mask |= STATX_ATTR_IMMUTABLE; if (inode->ei_inode.bi_flags & BCH_INODE_append) stat->attributes |= STATX_ATTR_APPEND; stat->attributes_mask |= STATX_ATTR_APPEND; if (inode->ei_inode.bi_flags & BCH_INODE_nodump) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= STATX_ATTR_NODUMP; return 0; } static int bch2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; int ret; lockdep_assert_held(&inode->v.i_rwsem); ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: setattr_prepare(idmap, dentry, iattr); if (ret) return ret; return iattr->ia_valid & ATTR_SIZE ? bchfs_truncate(idmap, inode, iattr) : bch2_setattr_nonsize(idmap, inode, iattr); } static int bch2_tmpfile(struct mnt_idmap *idmap, struct inode *vdir, struct file *file, umode_t mode) { struct bch_inode_info *inode = __bch2_create(idmap, to_bch_ei(vdir), file->f_path.dentry, mode, 0, (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (IS_ERR(inode)) return bch2_err_class(PTR_ERR(inode)); d_mark_tmpfile(file, &inode->v); d_instantiate(file->f_path.dentry, &inode->v); return finish_open_simple(file, 0); } static int bch2_fill_extent(struct bch_fs *c, struct fiemap_extent_info *info, struct bkey_s_c k, unsigned flags) { if (bkey_extent_is_direct_data(k.k)) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; int ret; if (k.k->type == KEY_TYPE_reflink_v) flags |= FIEMAP_EXTENT_SHARED; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int flags2 = 0; u64 offset = p.ptr.offset; if (p.ptr.unwritten) flags2 |= FIEMAP_EXTENT_UNWRITTEN; if (p.crc.compression_type) flags2 |= FIEMAP_EXTENT_ENCODED; else offset += p.crc.offset; if ((offset & (block_sectors(c) - 1)) || (k.k->size & (block_sectors(c) - 1))) flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; ret = fiemap_fill_next_extent(info, bkey_start_offset(k.k) << 9, offset << 9, k.k->size << 9, flags|flags2); if (ret) return ret; } return 0; } else if (bkey_extent_is_inline_data(k.k)) { return fiemap_fill_next_extent(info, bkey_start_offset(k.k) << 9, 0, k.k->size << 9, flags| FIEMAP_EXTENT_DATA_INLINE); } else if (k.k->type == KEY_TYPE_reservation) { return fiemap_fill_next_extent(info, bkey_start_offset(k.k) << 9, 0, k.k->size << 9, flags| FIEMAP_EXTENT_DELALLOC| FIEMAP_EXTENT_UNWRITTEN); } else { BUG(); } } static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, u64 start, u64 len) { struct bch_fs *c = vinode->i_sb->s_fs_info; struct bch_inode_info *ei = to_bch_ei(vinode); struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; bool have_extent = false; int ret = 0; ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); if (ret) return ret; struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); if (start + len < start) return -EINVAL; start >>= 9; bch2_bkey_buf_init(&cur); bch2_bkey_buf_init(&prev); trans = bch2_trans_get(c); bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(ei->v.i_ino, start), 0); while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) { enum btree_id data_btree = BTREE_ID_extents; bch2_trans_begin(trans); u32 snapshot; ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot); if (ret) continue; bch2_btree_iter_set_snapshot(trans, &iter, snapshot); k = bch2_btree_iter_peek_max(trans, &iter, end); ret = bkey_err(k); if (ret) continue; if (!k.k) break; if (!bkey_extent_is_data(k.k) && k.k->type != KEY_TYPE_reservation) { bch2_btree_iter_advance(trans, &iter); continue; } s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); unsigned sectors = k.k->size - offset_into_extent; bch2_bkey_buf_reassemble(&cur, c, k); ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &cur); if (ret) continue; k = bkey_i_to_s_c(cur.k); bch2_bkey_buf_realloc(&prev, c, k.k->u64s); sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); bch2_cut_front(POS(k.k->p.inode, bkey_start_offset(k.k) + offset_into_extent), cur.k); bch2_key_resize(&cur.k->k, sectors); cur.k->k.p = iter.pos; cur.k->k.p.offset += cur.k->k.size; if (have_extent) { bch2_trans_unlock(trans); ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), 0); if (ret) break; } bkey_copy(prev.k, cur.k); have_extent = true; bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, iter.pos.offset + sectors)); } bch2_trans_iter_exit(trans, &iter); if (!ret && have_extent) { bch2_trans_unlock(trans); ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); } bch2_trans_put(trans); bch2_bkey_buf_exit(&cur, c); bch2_bkey_buf_exit(&prev, c); return ret < 0 ? ret : 0; } static const struct vm_operations_struct bch_vm_ops = { .fault = bch2_page_fault, .map_pages = filemap_map_pages, .page_mkwrite = bch2_page_mkwrite, }; static int bch2_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &bch_vm_ops; return 0; } /* Directories: */ static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) { return generic_file_llseek_size(file, offset, whence, S64_MAX, S64_MAX); } static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; if (!dir_emit_dots(file, ctx)) return 0; int ret = bch2_readdir(c, inode_inum(inode), ctx); bch_err_fn(c, ret); return bch2_err_class(ret); } static int bch2_open(struct inode *vinode, struct file *file) { if (file->f_flags & (O_WRONLY|O_RDWR)) { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol); if (ret) return ret; } file->f_mode |= FMODE_CAN_ODIRECT; return generic_file_open(vinode, file); } static const struct file_operations bch_file_operations = { .open = bch2_open, .llseek = bch2_llseek, .read_iter = bch2_read_iter, .write_iter = bch2_write_iter, .mmap = bch2_mmap, .get_unmapped_area = thp_get_unmapped_area, .fsync = bch2_fsync, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = bch2_fallocate_dispatch, .unlocked_ioctl = bch2_fs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = bch2_compat_fs_ioctl, #endif .remap_file_range = bch2_remap_file_range, }; static const struct inode_operations bch_file_inode_operations = { .getattr = bch2_getattr, .setattr = bch2_setattr, .fiemap = bch2_fiemap, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; static const struct inode_operations bch_dir_inode_operations = { .lookup = bch2_lookup, .create = bch2_create, .link = bch2_link, .unlink = bch2_unlink, .symlink = bch2_symlink, .mkdir = bch2_mkdir, .rmdir = bch2_unlink, .mknod = bch2_mknod, .rename = bch2_rename2, .getattr = bch2_getattr, .setattr = bch2_setattr, .tmpfile = bch2_tmpfile, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; static const struct file_operations bch_dir_file_operations = { .llseek = bch2_dir_llseek, .read = generic_read_dir, .iterate_shared = bch2_vfs_readdir, .fsync = bch2_fsync, .unlocked_ioctl = bch2_fs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = bch2_compat_fs_ioctl, #endif }; static const struct inode_operations bch_symlink_inode_operations = { .get_link = page_get_link, .getattr = bch2_getattr, .setattr = bch2_setattr, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; static const struct inode_operations bch_special_inode_operations = { .getattr = bch2_getattr, .setattr = bch2_setattr, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; static const struct address_space_operations bch_address_space_operations = { .read_folio = bch2_read_folio, .writepages = bch2_writepages, .readahead = bch2_readahead, .dirty_folio = filemap_dirty_folio, .write_begin = bch2_write_begin, .write_end = bch2_write_end, .invalidate_folio = bch2_invalidate_folio, .release_folio = bch2_release_folio, #ifdef CONFIG_MIGRATION .migrate_folio = filemap_migrate_folio, #endif .error_remove_folio = generic_error_remove_folio, }; struct bcachefs_fid { u64 inum; u32 subvol; u32 gen; } __packed; struct bcachefs_fid_with_parent { struct bcachefs_fid fid; struct bcachefs_fid dir; } __packed; static int bcachefs_fid_valid(int fh_len, int fh_type) { switch (fh_type) { case FILEID_BCACHEFS_WITHOUT_PARENT: return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32); case FILEID_BCACHEFS_WITH_PARENT: return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32); default: return false; } } static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) { return (struct bcachefs_fid) { .inum = inode->ei_inum.inum, .subvol = inode->ei_inum.subvol, .gen = inode->ei_inode.bi_generation, }; } static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, struct inode *vdir) { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_inode_info *dir = to_bch_ei(vdir); int min_len; if (!S_ISDIR(inode->v.i_mode) && dir) { struct bcachefs_fid_with_parent *fid = (void *) fh; min_len = sizeof(*fid) / sizeof(u32); if (*len < min_len) { *len = min_len; return FILEID_INVALID; } fid->fid = bch2_inode_to_fid(inode); fid->dir = bch2_inode_to_fid(dir); *len = min_len; return FILEID_BCACHEFS_WITH_PARENT; } else { struct bcachefs_fid *fid = (void *) fh; min_len = sizeof(*fid) / sizeof(u32); if (*len < min_len) { *len = min_len; return FILEID_INVALID; } *fid = bch2_inode_to_fid(inode); *len = min_len; return FILEID_BCACHEFS_WITHOUT_PARENT; } } static struct inode *bch2_nfs_get_inode(struct super_block *sb, struct bcachefs_fid fid) { struct bch_fs *c = sb->s_fs_info; struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { .subvol = fid.subvol, .inum = fid.inum, }); if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { iput(vinode); vinode = ERR_PTR(-ESTALE); } return vinode; } static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, int fh_len, int fh_type) { struct bcachefs_fid *fid = (void *) _fid; if (!bcachefs_fid_valid(fh_len, fh_type)) return NULL; return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); } static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, int fh_len, int fh_type) { struct bcachefs_fid_with_parent *fid = (void *) _fid; if (!bcachefs_fid_valid(fh_len, fh_type) || fh_type != FILEID_BCACHEFS_WITH_PARENT) return NULL; return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); } static struct dentry *bch2_get_parent(struct dentry *child) { struct bch_inode_info *inode = to_bch_ei(child->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; subvol_inum parent_inum = { .subvol = inode->ei_inode.bi_parent_subvol ?: inode->ei_inum.subvol, .inum = inode->ei_inode.bi_dir, }; return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); } static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) { struct bch_inode_info *inode = to_bch_ei(child->d_inode); struct bch_inode_info *dir = to_bch_ei(parent->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans *trans; struct btree_iter iter1; struct btree_iter iter2; struct bkey_s_c k; struct bkey_s_c_dirent d; struct bch_inode_unpacked inode_u; subvol_inum target; u32 snapshot; struct qstr dirent_name; unsigned name_len = 0; int ret; if (!S_ISDIR(dir->v.i_mode)) return -EINVAL; trans = bch2_trans_get(c); bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents, POS(dir->ei_inode.bi_inum, 0), 0); bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents, POS(dir->ei_inode.bi_inum, 0), 0); retry: bch2_trans_begin(trans); ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot); if (ret) goto err; bch2_btree_iter_set_snapshot(trans, &iter1, snapshot); bch2_btree_iter_set_snapshot(trans, &iter2, snapshot); ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u); if (ret) goto err; if (inode_u.bi_dir == dir->ei_inode.bi_inum) { bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); k = bch2_btree_iter_peek_slot(trans, &iter1); ret = bkey_err(k); if (ret) goto err; if (k.k->type != KEY_TYPE_dirent) { ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; goto err; } d = bkey_s_c_to_dirent(k); ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); if (ret > 0) ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; if (ret) goto err; if (subvol_inum_eq(target, inode->ei_inum)) goto found; } else { /* * File with multiple hardlinks and our backref is to the wrong * directory - linear search: */ for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) { if (k.k->p.inode > dir->ei_inode.bi_inum) break; if (k.k->type != KEY_TYPE_dirent) continue; d = bkey_s_c_to_dirent(k); ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); if (ret < 0) break; if (ret) continue; if (subvol_inum_eq(target, inode->ei_inum)) goto found; } } ret = -ENOENT; goto err; found: dirent_name = bch2_dirent_get_name(d); name_len = min_t(unsigned, dirent_name.len, NAME_MAX); memcpy(name, dirent_name.name, name_len); name[name_len] = '\0'; err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_iter_exit(trans, &iter1); bch2_trans_iter_exit(trans, &iter2); bch2_trans_put(trans); return ret; } static const struct export_operations bch_export_ops = { .encode_fh = bch2_encode_fh, .fh_to_dentry = bch2_fh_to_dentry, .fh_to_parent = bch2_fh_to_parent, .get_parent = bch2_get_parent, .get_name = bch2_get_name, }; static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, struct bch_subvolume *subvol) { inode->v.i_ino = inum.inum; inode->ei_inum = inum; inode->ei_inode.bi_inum = inum.inum; bch2_inode_update_after_write(trans, inode, bi, ~0); inode->v.i_blocks = bi->bi_sectors; inode->v.i_rdev = bi->bi_dev; inode->v.i_generation = bi->bi_generation; inode->v.i_size = bi->bi_size; inode->ei_flags = 0; inode->ei_quota_reserved = 0; inode->ei_qid = bch_qid(bi); if (BCH_SUBVOLUME_SNAP(subvol)) set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); inode->v.i_mapping->a_ops = &bch_address_space_operations; switch (inode->v.i_mode & S_IFMT) { case S_IFREG: inode->v.i_op = &bch_file_inode_operations; inode->v.i_fop = &bch_file_operations; break; case S_IFDIR: inode->v.i_op = &bch_dir_inode_operations; inode->v.i_fop = &bch_dir_file_operations; break; case S_IFLNK: inode_nohighmem(&inode->v); inode->v.i_op = &bch_symlink_inode_operations; break; default: init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); inode->v.i_op = &bch_special_inode_operations; break; } mapping_set_folio_min_order(inode->v.i_mapping, get_order(trans->c->opts.block_size)); } static void bch2_free_inode(struct inode *vinode) { kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode)); } static int inode_update_times_fn(struct btree_trans *trans, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { struct bch_fs *c = inode->v.i_sb->s_fs_info; bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v)); bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v)); bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v)); return 0; } static int bch2_vfs_write_inode(struct inode *vinode, struct writeback_control *wbc) { struct bch_fs *c = vinode->i_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(vinode); int ret; mutex_lock(&inode->ei_update_lock); ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); return bch2_err_class(ret); } static void bch2_evict_inode(struct inode *vinode) { struct bch_fs *c = vinode->i_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(vinode); bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v); /* * evict() has waited for outstanding writeback, we'll do no more IO * through this inode: it's safe to remove from VFS inode hashtable here * * Do that now so that other threads aren't blocked from pulling it back * in, there's no reason for them to be: */ if (!delete) bch2_inode_hash_remove(c, inode); truncate_inode_pages_final(&inode->v.i_data); clear_inode(&inode->v); BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); if (delete) { bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); bch2_inode_rm(c, inode_inum(inode)); /* * If we are deleting, we need it present in the vfs hash table * so that fsck can check if unlinked inodes are still open: */ bch2_inode_hash_remove(c, inode); } mutex_lock(&c->vfs_inodes_lock); list_del_init(&inode->ei_vfs_inode_list); mutex_unlock(&c->vfs_inodes_lock); } void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) { struct bch_inode_info *inode; DARRAY(struct bch_inode_info *) grabbed; bool clean_pass = false, this_pass_clean; /* * Initially, we scan for inodes without I_DONTCACHE, then mark them to * be pruned with d_mark_dontcache(). * * Once we've had a clean pass where we didn't find any inodes without * I_DONTCACHE, we wait for them to be freed: */ darray_init(&grabbed); darray_make_room(&grabbed, 1024); again: cond_resched(); this_pass_clean = true; mutex_lock(&c->vfs_inodes_lock); list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { if (!snapshot_list_has_id(s, inode->ei_inum.subvol)) continue; if (!(inode->v.i_state & I_DONTCACHE) && !(inode->v.i_state & I_FREEING) && igrab(&inode->v)) { this_pass_clean = false; if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) { iput(&inode->v); break; } } else if (clean_pass && this_pass_clean) { struct wait_bit_queue_entry wqe; struct wait_queue_head *wq_head; wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW); prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); mutex_unlock(&c->vfs_inodes_lock); schedule(); finish_wait(wq_head, &wqe.wq_entry); goto again; } } mutex_unlock(&c->vfs_inodes_lock); darray_for_each(grabbed, i) { inode = *i; d_mark_dontcache(&inode->v); d_prune_aliases(&inode->v); iput(&inode->v); } grabbed.nr = 0; if (!clean_pass || !this_pass_clean) { clean_pass = this_pass_clean; goto again; } darray_exit(&grabbed); } static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct bch_fs *c = sb->s_fs_info; struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); unsigned shift = sb->s_blocksize_bits - 9; /* * this assumes inodes take up 64 bytes, which is a decent average * number: */ u64 avail_inodes = ((usage.capacity - usage.used) << 3); buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = usage.capacity >> shift; buf->f_bfree = usage.free >> shift; buf->f_bavail = avail_factor(usage.free) >> shift; buf->f_files = usage.nr_inodes + avail_inodes; buf->f_ffree = avail_inodes; buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b); buf->f_namelen = BCH_NAME_MAX; return 0; } static int bch2_sync_fs(struct super_block *sb, int wait) { struct bch_fs *c = sb->s_fs_info; int ret; trace_bch2_sync_fs(sb, wait); if (c->opts.journal_flush_disabled) return 0; if (!wait) { bch2_journal_flush_async(&c->journal, NULL); return 0; } ret = bch2_journal_flush(&c->journal); return bch2_err_class(ret); } static struct bch_fs *bch2_path_to_fs(const char *path) { struct bch_fs *c; dev_t dev; int ret; ret = lookup_bdev(path, &dev); if (ret) return ERR_PTR(ret); c = bch2_dev_to_fs(dev); if (c) closure_put(&c->cl); return c ?: ERR_PTR(-ENOENT); } static int bch2_show_devname(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; bool first = true; for_each_online_member(c, ca) { if (!first) seq_putc(seq, ':'); first = false; seq_puts(seq, ca->disk_sb.sb_name); } return 0; } static int bch2_show_options(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; struct printbuf buf = PRINTBUF; bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb, OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE); printbuf_nul_terminate(&buf); seq_printf(seq, ",%s", buf.buf); int ret = buf.allocation_failure ? -ENOMEM : 0; printbuf_exit(&buf); return ret; } static void bch2_put_super(struct super_block *sb) { struct bch_fs *c = sb->s_fs_info; __bch2_fs_stop(c); } /* * bcachefs doesn't currently integrate intwrite freeze protection but the * internal write references serve the same purpose. Therefore reuse the * |